Moodle 4.1 XRef and Diffs

Search moodle.org's
Developer Documentation
See Release Notes
Long Term Support Release
Bug fixes for general core bugs in 4.1.x will end 13 November 2023 (12 months).
Bug fixes for security issues in 4.1.x will end 10 November 2025 (36 months).
PHP version: minimum PHP 7.4.0 Note: minimum PHP version has increased since Moodle 4.0. PHP 8.0.x is supported too.
Moodle 4.1 Database Schema (by Marcus Green)
/lib/htmlpurifier/HTMLPurifier/ -> Encoder.php (source)
Differences Between: [Versions 310 and 401] [Versions 311 and 401] [Versions 39 and 401] [Versions 400 and 401]
   1  <?php
   2  
   3  /**
   4   * A UTF-8 specific character encoder that handles cleaning and transforming.
   5   * @note All functions in this class should be static.
   6   */
   7  class HTMLPurifier_Encoder
   8  {
   9  
  10      /**
  11       * Constructor throws fatal error if you attempt to instantiate class
  12       */
  13      private function __construct()
  14      {
  15          trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
  16      }
  17  
  18      /**
  19       * Error-handler that mutes errors, alternative to shut-up operator.
  20       */
  21      public static function muteErrorHandler()
  22      {
  23      }
  24  
  25      /**
  26       * iconv wrapper which mutes errors, but doesn't work around bugs.
  27       * @param string $in Input encoding
  28       * @param string $out Output encoding
  29       * @param string $text The text to convert
  30       * @return string
  31       */
  32      public static function unsafeIconv($in, $out, $text)
  33      {
  34          set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
  35          $r = iconv($in, $out, $text);
  36          restore_error_handler();
  37          return $r;
  38      }
  39  
  40      /**
  41       * iconv wrapper which mutes errors and works around bugs.
  42       * @param string $in Input encoding
  43       * @param string $out Output encoding
  44       * @param string $text The text to convert
  45       * @param int $max_chunk_size
  46       * @return string
  47       */
  48      public static function iconv($in, $out, $text, $max_chunk_size = 8000)
  49      {
  50          $code = self::testIconvTruncateBug();
  51          if ($code == self::ICONV_OK) {
  52              return self::unsafeIconv($in, $out, $text);
  53          } elseif ($code == self::ICONV_TRUNCATES) {
  54              // we can only work around this if the input character set
  55              // is utf-8
  56              if ($in == 'utf-8') {
  57                  if ($max_chunk_size < 4) {
  58                      trigger_error('max_chunk_size is too small', E_USER_WARNING);
  59                      return false;
  60                  }
  61                  // split into 8000 byte chunks, but be careful to handle
  62                  // multibyte boundaries properly
  63                  if (($c = strlen($text)) <= $max_chunk_size) {
  64                      return self::unsafeIconv($in, $out, $text);
  65                  }
  66                  $r = '';
  67                  $i = 0;
  68                  while (true) {
  69                      if ($i + $max_chunk_size >= $c) {
  70                          $r .= self::unsafeIconv($in, $out, substr($text, $i));
  71                          break;
  72                      }
  73                      // wibble the boundary
  74                      if (0x80 != (0xC0 & ord($text[$i + $max_chunk_size]))) {
  75                          $chunk_size = $max_chunk_size;
  76                      } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 1]))) {
  77                          $chunk_size = $max_chunk_size - 1;
  78                      } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 2]))) {
  79                          $chunk_size = $max_chunk_size - 2;
  80                      } elseif (0x80 != (0xC0 & ord($text[$i + $max_chunk_size - 3]))) {
  81                          $chunk_size = $max_chunk_size - 3;
  82                      } else {
  83                          return false; // rather confusing UTF-8...
  84                      }
  85                      $chunk = substr($text, $i, $chunk_size); // substr doesn't mind overlong lengths
  86                      $r .= self::unsafeIconv($in, $out, $chunk);
  87                      $i += $chunk_size;
  88                  }
  89                  return $r;
  90              } else {
  91                  return false;
  92              }
  93          } else {
  94              return false;
  95          }
  96      }
  97  
  98      /**
  99       * Cleans a UTF-8 string for well-formedness and SGML validity
 100       *
 101       * It will parse according to UTF-8 and return a valid UTF8 string, with
 102       * non-SGML codepoints excluded.
 103       *
 104       * Specifically, it will permit:
 105       * \x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}
 106       * Source: https://www.w3.org/TR/REC-xml/#NT-Char
 107       * Arguably this function should be modernized to the HTML5 set
 108       * of allowed characters:
 109       * https://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
 110       * which simultaneously expand and restrict the set of allowed characters.
 111       *
 112       * @param string $str The string to clean
 113       * @param bool $force_php
 114       * @return string
 115       *
 116       * @note Just for reference, the non-SGML code points are 0 to 31 and
 117       *       127 to 159, inclusive.  However, we allow code points 9, 10
 118       *       and 13, which are the tab, line feed and carriage return
 119       *       respectively. 128 and above the code points map to multibyte
 120       *       UTF-8 representations.
 121       *
 122       * @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
 123       *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
 124       *       LGPL license.  Notes on what changed are inside, but in general,
 125       *       the original code transformed UTF-8 text into an array of integer
 126       *       Unicode codepoints. Understandably, transforming that back to
 127       *       a string would be somewhat expensive, so the function was modded to
 128       *       directly operate on the string.  However, this discourages code
 129       *       reuse, and the logic enumerated here would be useful for any
 130       *       function that needs to be able to understand UTF-8 characters.
 131       *       As of right now, only smart lossless character encoding converters
 132       *       would need that, and I'm probably not going to implement them.
 133       */
 134      public static function cleanUTF8($str, $force_php = false)
 135      {
 136          // UTF-8 validity is checked since PHP 4.3.5
 137          // This is an optimization: if the string is already valid UTF-8, no
 138          // need to do PHP stuff. 99% of the time, this will be the case.
 139          if (preg_match(
 140              '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du',
 141              $str
 142          )) {
 143              return $str;
 144          }
 145  
 146          $mState = 0; // cached expected number of octets after the current octet
 147                       // until the beginning of the next UTF8 character sequence
 148          $mUcs4  = 0; // cached Unicode character
 149          $mBytes = 1; // cached expected number of octets in the current sequence
 150  
 151          // original code involved an $out that was an array of Unicode
 152          // codepoints.  Instead of having to convert back into UTF-8, we've
 153          // decided to directly append valid UTF-8 characters onto a string
 154          // $out once they're done.  $char accumulates raw bytes, while $mUcs4
 155          // turns into the Unicode code point, so there's some redundancy.
 156  
 157          $out = '';
 158          $char = '';
 159  
 160          $len = strlen($str);
 161          for ($i = 0; $i < $len; $i++) {
 162              $in = ord($str[$i]);
 163              $char .= $str[$i]; // append byte to char
 164              if (0 == $mState) {
 165                  // When mState is zero we expect either a US-ASCII character
 166                  // or a multi-octet sequence.
 167                  if (0 == (0x80 & ($in))) {
 168                      // US-ASCII, pass straight through.
 169                      if (($in <= 31 || $in == 127) &&
 170                          !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
 171                      ) {
 172                          // control characters, remove
 173                      } else {
 174                          $out .= $char;
 175                      }
 176                      // reset
 177                      $char = '';
 178                      $mBytes = 1;
 179                  } elseif (0xC0 == (0xE0 & ($in))) {
 180                      // First octet of 2 octet sequence
 181                      $mUcs4 = ($in);
 182                      $mUcs4 = ($mUcs4 & 0x1F) << 6;
 183                      $mState = 1;
 184                      $mBytes = 2;
 185                  } elseif (0xE0 == (0xF0 & ($in))) {
 186                      // First octet of 3 octet sequence
 187                      $mUcs4 = ($in);
 188                      $mUcs4 = ($mUcs4 & 0x0F) << 12;
 189                      $mState = 2;
 190                      $mBytes = 3;
 191                  } elseif (0xF0 == (0xF8 & ($in))) {
 192                      // First octet of 4 octet sequence
 193                      $mUcs4 = ($in);
 194                      $mUcs4 = ($mUcs4 & 0x07) << 18;
 195                      $mState = 3;
 196                      $mBytes = 4;
 197                  } elseif (0xF8 == (0xFC & ($in))) {
 198                      // First octet of 5 octet sequence.
 199                      //
 200                      // This is illegal because the encoded codepoint must be
 201                      // either:
 202                      // (a) not the shortest form or
 203                      // (b) outside the Unicode range of 0-0x10FFFF.
 204                      // Rather than trying to resynchronize, we will carry on
 205                      // until the end of the sequence and let the later error
 206                      // handling code catch it.
 207                      $mUcs4 = ($in);
 208                      $mUcs4 = ($mUcs4 & 0x03) << 24;
 209                      $mState = 4;
 210                      $mBytes = 5;
 211                  } elseif (0xFC == (0xFE & ($in))) {
 212                      // First octet of 6 octet sequence, see comments for 5
 213                      // octet sequence.
 214                      $mUcs4 = ($in);
 215                      $mUcs4 = ($mUcs4 & 1) << 30;
 216                      $mState = 5;
 217                      $mBytes = 6;
 218                  } else {
 219                      // Current octet is neither in the US-ASCII range nor a
 220                      // legal first octet of a multi-octet sequence.
 221                      $mState = 0;
 222                      $mUcs4  = 0;
 223                      $mBytes = 1;
 224                      $char = '';
 225                  }
 226              } else {
 227                  // When mState is non-zero, we expect a continuation of the
 228                  // multi-octet sequence
 229                  if (0x80 == (0xC0 & ($in))) {
 230                      // Legal continuation.
 231                      $shift = ($mState - 1) * 6;
 232                      $tmp = $in;
 233                      $tmp = ($tmp & 0x0000003F) << $shift;
 234                      $mUcs4 |= $tmp;
 235  
 236                      if (0 == --$mState) {
 237                          // End of the multi-octet sequence. mUcs4 now contains
 238                          // the final Unicode codepoint to be output
 239  
 240                          // Check for illegal sequences and codepoints.
 241  
 242                          // From Unicode 3.1, non-shortest form is illegal
 243                          if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 244                              ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 245                              ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
 246                              (4 < $mBytes) ||
 247                              // From Unicode 3.2, surrogate characters = illegal
 248                              (($mUcs4 & 0xFFFFF800) == 0xD800) ||
 249                              // Codepoints outside the Unicode range are illegal
 250                              ($mUcs4 > 0x10FFFF)
 251                          ) {
 252  
 253                          } elseif (0xFEFF != $mUcs4 && // omit BOM
 254                              // check for valid Char unicode codepoints
 255                              (
 256                                  0x9 == $mUcs4 ||
 257                                  0xA == $mUcs4 ||
 258                                  0xD == $mUcs4 ||
 259                                  (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
 260                                  // 7F-9F is not strictly prohibited by XML,
 261                                  // but it is non-SGML, and thus we don't allow it
 262                                  (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
 263                                  (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
 264                                  (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
 265                              )
 266                          ) {
 267                              $out .= $char;
 268                          }
 269                          // initialize UTF8 cache (reset)
 270                          $mState = 0;
 271                          $mUcs4  = 0;
 272                          $mBytes = 1;
 273                          $char = '';
 274                      }
 275                  } else {
 276                      // ((0xC0 & (*in) != 0x80) && (mState != 0))
 277                      // Incomplete multi-octet sequence.
 278                      // used to result in complete fail, but we'll reset
 279                      $mState = 0;
 280                      $mUcs4  = 0;
 281                      $mBytes = 1;
 282                      $char ='';
 283                  }
 284              }
 285          }
 286          return $out;
 287      }
 288  
 289      /**
 290       * Translates a Unicode codepoint into its corresponding UTF-8 character.
 291       * @note Based on Feyd's function at
 292       *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
 293       *       which is in public domain.
 294       * @note While we're going to do code point parsing anyway, a good
 295       *       optimization would be to refuse to translate code points that
 296       *       are non-SGML characters.  However, this could lead to duplication.
 297       * @note This is very similar to the unichr function in
 298       *       maintenance/generate-entity-file.php (although this is superior,
 299       *       due to its sanity checks).
 300       */
 301  
 302      // +----------+----------+----------+----------+
 303      // | 33222222 | 22221111 | 111111   |          |
 304      // | 10987654 | 32109876 | 54321098 | 76543210 | bit
 305      // +----------+----------+----------+----------+
 306      // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
 307      // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
 308      // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
 309      // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
 310      // +----------+----------+----------+----------+
 311      // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
 312      // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
 313      // +----------+----------+----------+----------+
 314  
 315      public static function unichr($code)
 316      {
 317          if ($code > 1114111 or $code < 0 or
 318            ($code >= 55296 and $code <= 57343) ) {
 319              // bits are set outside the "valid" range as defined
 320              // by UNICODE 4.1.0
 321              return '';
 322          }
 323  
 324          $x = $y = $z = $w = 0;
 325          if ($code < 128) {
 326              // regular ASCII character
 327              $x = $code;
 328          } else {
 329              // set up bits for UTF-8
 330              $x = ($code & 63) | 128;
 331              if ($code < 2048) {
 332                  $y = (($code & 2047) >> 6) | 192;
 333              } else {
 334                  $y = (($code & 4032) >> 6) | 128;
 335                  if ($code < 65536) {
 336                      $z = (($code >> 12) & 15) | 224;
 337                  } else {
 338                      $z = (($code >> 12) & 63) | 128;
 339                      $w = (($code >> 18) & 7)  | 240;
 340                  }
 341              }
 342          }
 343          // set up the actual character
 344          $ret = '';
 345          if ($w) {
 346              $ret .= chr($w);
 347          }
 348          if ($z) {
 349              $ret .= chr($z);
 350          }
 351          if ($y) {
 352              $ret .= chr($y);
 353          }
 354          $ret .= chr($x);
 355  
 356          return $ret;
 357      }
 358  
 359      /**
 360       * @return bool
 361       */
 362      public static function iconvAvailable()
 363      {
 364          static $iconv = null;
 365          if ($iconv === null) {
 366              $iconv = function_exists('iconv') && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;
 367          }
 368          return $iconv;
 369      }
 370  
 371      /**
 372       * Convert a string to UTF-8 based on configuration.
 373       * @param string $str The string to convert
 374       * @param HTMLPurifier_Config $config
 375       * @param HTMLPurifier_Context $context
 376       * @return string
 377       */
 378      public static function convertToUTF8($str, $config, $context)
 379      {
 380          $encoding = $config->get('Core.Encoding');
 381          if ($encoding === 'utf-8') {
 382              return $str;
 383          }
 384          static $iconv = null;
 385          if ($iconv === null) {
 386              $iconv = self::iconvAvailable();
 387          }
 388          if ($iconv && !$config->get('Test.ForceNoIconv')) {
 389              // unaffected by bugs, since UTF-8 support all characters
 390              $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str);
 391              if ($str === false) {
 392                  // $encoding is not a valid encoding
 393                  trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
 394                  return '';
 395              }
 396              // If the string is bjorked by Shift_JIS or a similar encoding
 397              // that doesn't support all of ASCII, convert the naughty
 398              // characters to their true byte-wise ASCII/UTF-8 equivalents.
 399              $str = strtr($str, self::testEncodingSupportsASCII($encoding));
 400              return $str;
 401          } elseif ($encoding === 'iso-8859-1' && function_exists('mb_convert_encoding')) {
 402              $str = mb_convert_encoding($str, 'UTF-8', 'ISO-8859-1');
 403              return $str;
 404          }
 405          $bug = HTMLPurifier_Encoder::testIconvTruncateBug();
 406          if ($bug == self::ICONV_OK) {
 407              trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
 408          } else {
 409              trigger_error(
 410                  'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 ' .
 411                  'and http://sourceware.org/bugzilla/show_bug.cgi?id=13541',
 412                  E_USER_ERROR
 413              );
 414          }
 415      }
 416  
 417      /**
 418       * Converts a string from UTF-8 based on configuration.
 419       * @param string $str The string to convert
 420       * @param HTMLPurifier_Config $config
 421       * @param HTMLPurifier_Context $context
 422       * @return string
 423       * @note Currently, this is a lossy conversion, with unexpressable
 424       *       characters being omitted.
 425       */
 426      public static function convertFromUTF8($str, $config, $context)
 427      {
 428          $encoding = $config->get('Core.Encoding');
 429          if ($escape = $config->get('Core.EscapeNonASCIICharacters')) {
 430              $str = self::convertToASCIIDumbLossless($str);
 431          }
 432          if ($encoding === 'utf-8') {
 433              return $str;
 434          }
 435          static $iconv = null;
 436          if ($iconv === null) {
 437              $iconv = self::iconvAvailable();
 438          }
 439          if ($iconv && !$config->get('Test.ForceNoIconv')) {
 440              // Undo our previous fix in convertToUTF8, otherwise iconv will barf
 441              $ascii_fix = self::testEncodingSupportsASCII($encoding);
 442              if (!$escape && !empty($ascii_fix)) {
 443                  $clear_fix = array();
 444                  foreach ($ascii_fix as $utf8 => $native) {
 445                      $clear_fix[$utf8] = '';
 446                  }
 447                  $str = strtr($str, $clear_fix);
 448              }
 449              $str = strtr($str, array_flip($ascii_fix));
 450              // Normal stuff
 451              $str = self::iconv('utf-8', $encoding . '//IGNORE', $str);
 452              return $str;
 453          } elseif ($encoding === 'iso-8859-1' && function_exists('mb_convert_encoding')) {
 454              $str = mb_convert_encoding($str, 'ISO-8859-1', 'UTF-8');
 455              return $str;
 456          }
 457          trigger_error('Encoding not supported', E_USER_ERROR);
 458          // You might be tempted to assume that the ASCII representation
 459          // might be OK, however, this is *not* universally true over all
 460          // encodings.  So we take the conservative route here, rather
 461          // than forcibly turn on %Core.EscapeNonASCIICharacters
 462      }
 463  
 464      /**
 465       * Lossless (character-wise) conversion of HTML to ASCII
 466       * @param string $str UTF-8 string to be converted to ASCII
 467       * @return string ASCII encoded string with non-ASCII character entity-ized
 468       * @warning Adapted from MediaWiki, claiming fair use: this is a common
 469       *       algorithm. If you disagree with this license fudgery,
 470       *       implement it yourself.
 471       * @note Uses decimal numeric entities since they are best supported.
 472       * @note This is a DUMB function: it has no concept of keeping
 473       *       character entities that the projected character encoding
 474       *       can allow. We could possibly implement a smart version
 475       *       but that would require it to also know which Unicode
 476       *       codepoints the charset supported (not an easy task).
 477       * @note Sort of with cleanUTF8() but it assumes that $str is
 478       *       well-formed UTF-8
 479       */
 480      public static function convertToASCIIDumbLossless($str)
 481      {
 482          $bytesleft = 0;
 483          $result = '';
 484          $working = 0;
 485          $len = strlen($str);
 486          for ($i = 0; $i < $len; $i++) {
 487              $bytevalue = ord($str[$i]);
 488              if ($bytevalue <= 0x7F) { //0xxx xxxx
 489                  $result .= chr($bytevalue);
 490                  $bytesleft = 0;
 491              } elseif ($bytevalue <= 0xBF) { //10xx xxxx
 492                  $working = $working << 6;
 493                  $working += ($bytevalue & 0x3F);
 494                  $bytesleft--;
 495                  if ($bytesleft <= 0) {
 496                      $result .= "&#" . $working . ";";
 497                  }
 498              } elseif ($bytevalue <= 0xDF) { //110x xxxx
 499                  $working = $bytevalue & 0x1F;
 500                  $bytesleft = 1;
 501              } elseif ($bytevalue <= 0xEF) { //1110 xxxx
 502                  $working = $bytevalue & 0x0F;
 503                  $bytesleft = 2;
 504              } else { //1111 0xxx
 505                  $working = $bytevalue & 0x07;
 506                  $bytesleft = 3;
 507              }
 508          }
 509          return $result;
 510      }
 511  
 512      /** No bugs detected in iconv. */
 513      const ICONV_OK = 0;
 514  
 515      /** Iconv truncates output if converting from UTF-8 to another
 516       *  character set with //IGNORE, and a non-encodable character is found */
 517      const ICONV_TRUNCATES = 1;
 518  
 519      /** Iconv does not support //IGNORE, making it unusable for
 520       *  transcoding purposes */
 521      const ICONV_UNUSABLE = 2;
 522  
 523      /**
 524       * glibc iconv has a known bug where it doesn't handle the magic
 525       * //IGNORE stanza correctly.  In particular, rather than ignore
 526       * characters, it will return an EILSEQ after consuming some number
 527       * of characters, and expect you to restart iconv as if it were
 528       * an E2BIG.  Old versions of PHP did not respect the errno, and
 529       * returned the fragment, so as a result you would see iconv
 530       * mysteriously truncating output. We can work around this by
 531       * manually chopping our input into segments of about 8000
 532       * characters, as long as PHP ignores the error code.  If PHP starts
 533       * paying attention to the error code, iconv becomes unusable.
 534       *
 535       * @return int Error code indicating severity of bug.
 536       */
 537      public static function testIconvTruncateBug()
 538      {
 539          static $code = null;
 540          if ($code === null) {
 541              // better not use iconv, otherwise infinite loop!
 542              $r = self::unsafeIconv('utf-8', 'ascii//IGNORE', "\xCE\xB1" . str_repeat('a', 9000));
 543              if ($r === false) {
 544                  $code = self::ICONV_UNUSABLE;
 545              } elseif (($c = strlen($r)) < 9000) {
 546                  $code = self::ICONV_TRUNCATES;
 547              } elseif ($c > 9000) {
 548                  trigger_error(
 549                      'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: ' .
 550                      'include your iconv version as per phpversion()',
 551                      E_USER_ERROR
 552                  );
 553              } else {
 554                  $code = self::ICONV_OK;
 555              }
 556          }
 557          return $code;
 558      }
 559  
 560      /**
 561       * This expensive function tests whether or not a given character
 562       * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
 563       * fail this test, and require special processing. Variable width
 564       * encodings shouldn't ever fail.
 565       *
 566       * @param string $encoding Encoding name to test, as per iconv format
 567       * @param bool $bypass Whether or not to bypass the precompiled arrays.
 568       * @return Array of UTF-8 characters to their corresponding ASCII,
 569       *      which can be used to "undo" any overzealous iconv action.
 570       */
 571      public static function testEncodingSupportsASCII($encoding, $bypass = false)
 572      {
 573          // All calls to iconv here are unsafe, proof by case analysis:
 574          // If ICONV_OK, no difference.
 575          // If ICONV_TRUNCATE, all calls involve one character inputs,
 576          // so bug is not triggered.
 577          // If ICONV_UNUSABLE, this call is irrelevant
 578          static $encodings = array();
 579          if (!$bypass) {
 580              if (isset($encodings[$encoding])) {
 581                  return $encodings[$encoding];
 582              }
 583              $lenc = strtolower($encoding);
 584              switch ($lenc) {
 585                  case 'shift_jis':
 586                      return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
 587                  case 'johab':
 588                      return array("\xE2\x82\xA9" => '\\');
 589              }
 590              if (strpos($lenc, 'iso-8859-') === 0) {
 591                  return array();
 592              }
 593          }
 594          $ret = array();
 595          if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) {
 596              return false;
 597          }
 598          for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
 599              $c = chr($i); // UTF-8 char
 600              $r = self::unsafeIconv('UTF-8', "$encoding//IGNORE", $c); // initial conversion
 601              if ($r === '' ||
 602                  // This line is needed for iconv implementations that do not
 603                  // omit characters that do not exist in the target character set
 604                  ($r === $c && self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c)
 605              ) {
 606                  // Reverse engineer: what's the UTF-8 equiv of this byte
 607                  // sequence? This assumes that there's no variable width
 608                  // encoding that doesn't support ASCII.
 609                  $ret[self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
 610              }
 611          }
 612          $encodings[$encoding] = $ret;
 613          return $ret;
 614      }
 615  }
 616  
 617  // vim: et sw=4 sts=4