See Release Notes
Long Term Support Release
1 <?php 2 /*************************************************************** 3 * Copyright notice 4 * 5 * (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com) 6 * All rights reserved 7 * 8 * This script is part of the Typo3 project. The Typo3 project is 9 * free software; you can redistribute it and/or modify 10 * it under the terms of the GNU General Public License as published by 11 * the Free Software Foundation; either version 2 of the License, or 12 * (at your option) any later version. 13 * 14 * The GNU General Public License can be found at 15 * http://www.gnu.org/copyleft/gpl.html. 16 * 17 * This script is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 * GNU General Public License for more details. 21 * 22 * This copyright notice MUST APPEAR in all copies of the script! 23 ***************************************************************/ 24 /** 25 * Class for conversion between charsets. 
26 * 27 * @author Kasper Skårhøj <kasperYYYY@typo3.com> 28 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 29 */ 30 31 32 /** 33 * Notes on UTF-8 34 * 35 * Functions working on UTF-8 strings: 36 * 37 * - strchr/strstr 38 * - strrchr 39 * - substr_count 40 * - implode/explode/join 41 * 42 * Functions nearly working on UTF-8 strings: 43 * 44 * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen 45 * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII 46 * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos 47 * - htmlentities: charset support for UTF-8 only since PHP 4.3.0 48 * - preg_*: Support compiled into PHP by default nowadays, but could be unavailable, need to use modifier 49 * 50 * Functions NOT working on UTF-8 strings: 51 * 52 * - str*cmp 53 * - stristr 54 * - stripos 55 * - substr 56 * - strrev 57 * - split/spliti 58 * - ... 59 * 60 */ 61 /** 62 * Class for conversion between charsets 63 * 64 * @author Kasper Skårhøj <kasperYYYY@typo3.com> 65 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 66 * @package TYPO3 67 * @subpackage t3lib 68 */ 69 class t3lib_cs { 70 71 /** 72 * @var t3lib_l10n_Locales 73 */ 74 protected $locales; 75 76 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent. 
77 78 // This is the array where parsed conversion tables are stored (cached) 79 var $parsedCharsets = array(); 80 81 // An array where case folding data will be stored (cached) 82 var $caseFolding = array(); 83 84 // An array where charset-to-ASCII mappings are stored (cached) 85 var $toASCII = array(); 86 87 // This tells the converter which charsets has two bytes per char: 88 var $twoByteSets = array( 89 'ucs-2' => 1, // 2-byte Unicode 90 ); 91 92 // This tells the converter which charsets has four bytes per char: 93 var $fourByteSets = array( 94 'ucs-4' => 1, // 4-byte Unicode 95 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16) 96 ); 97 98 // This tells the converter which charsets use a scheme like the Extended Unix Code: 99 var $eucBasedSets = array( 100 'gb2312' => 1, // Chinese, simplified. 101 'big5' => 1, // Chinese, traditional. 102 'euc-kr' => 1, // Korean 103 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80! 
104 ); 105 106 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html 107 // http://czyborra.com/charsets/iso8859.html 108 var $synonyms = array( 109 'us' => 'ascii', 110 'us-ascii' => 'ascii', 111 'cp819' => 'iso-8859-1', 112 'ibm819' => 'iso-8859-1', 113 'iso-ir-100' => 'iso-8859-1', 114 'iso-ir-101' => 'iso-8859-2', 115 'iso-ir-109' => 'iso-8859-3', 116 'iso-ir-110' => 'iso-8859-4', 117 'iso-ir-144' => 'iso-8859-5', 118 'iso-ir-127' => 'iso-8859-6', 119 'iso-ir-126' => 'iso-8859-7', 120 'iso-ir-138' => 'iso-8859-8', 121 'iso-ir-148' => 'iso-8859-9', 122 'iso-ir-157' => 'iso-8859-10', 123 'iso-ir-179' => 'iso-8859-13', 124 'iso-ir-199' => 'iso-8859-14', 125 'iso-ir-203' => 'iso-8859-15', 126 'csisolatin1' => 'iso-8859-1', 127 'csisolatin2' => 'iso-8859-2', 128 'csisolatin3' => 'iso-8859-3', 129 'csisolatin5' => 'iso-8859-9', 130 'csisolatin8' => 'iso-8859-14', 131 'csisolatin9' => 'iso-8859-15', 132 'csisolatingreek' => 'iso-8859-7', 133 'iso-celtic' => 'iso-8859-14', 134 'latin1' => 'iso-8859-1', 135 'latin2' => 'iso-8859-2', 136 'latin3' => 'iso-8859-3', 137 'latin5' => 'iso-8859-9', 138 'latin6' => 'iso-8859-10', 139 'latin8' => 'iso-8859-14', 140 'latin9' => 'iso-8859-15', 141 'l1' => 'iso-8859-1', 142 'l2' => 'iso-8859-2', 143 'l3' => 'iso-8859-3', 144 'l5' => 'iso-8859-9', 145 'l6' => 'iso-8859-10', 146 'l8' => 'iso-8859-14', 147 'l9' => 'iso-8859-15', 148 'cyrillic' => 'iso-8859-5', 149 'arabic' => 'iso-8859-6', 150 'tis-620' => 'iso-8859-11', 151 'win874' => 'windows-874', 152 'win1250' => 'windows-1250', 153 'win1251' => 'windows-1251', 154 'win1252' => 'windows-1252', 155 'win1253' => 'windows-1253', 156 'win1254' => 'windows-1254', 157 'win1255' => 'windows-1255', 158 'win1256' => 'windows-1256', 159 'win1257' => 'windows-1257', 160 'win1258' => 'windows-1258', 161 'cp1250' => 'windows-1250', 162 'cp1251' => 'windows-1251', 163 'cp1252' => 'windows-1252', 164 'ms-ee' => 'windows-1250', 165 
'ms-ansi' => 'windows-1252', 166 'ms-greek' => 'windows-1253', 167 'ms-turk' => 'windows-1254', 168 'winbaltrim' => 'windows-1257', 169 'koi-8ru' => 'koi-8r', 170 'koi8r' => 'koi-8r', 171 'cp878' => 'koi-8r', 172 'mac' => 'macroman', 173 'macintosh' => 'macroman', 174 'euc-cn' => 'gb2312', 175 'x-euc-cn' => 'gb2312', 176 'euccn' => 'gb2312', 177 'cp936' => 'gb2312', 178 'big-5' => 'big5', 179 'cp950' => 'big5', 180 'eucjp' => 'euc-jp', 181 'sjis' => 'shift_jis', 182 'shift-jis' => 'shift_jis', 183 'cp932' => 'shift_jis', 184 'cp949' => 'euc-kr', 185 'utf7' => 'utf-7', 186 'utf8' => 'utf-8', 187 'utf16' => 'utf-16', 188 'utf32' => 'utf-32', 189 'utf8' => 'utf-8', 190 'ucs2' => 'ucs-2', 191 'ucs4' => 'ucs-4', 192 ); 193 194 // mapping of iso-639-1 language codes to script names 195 var $lang_to_script = array( 196 // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php 197 'af' => 'west_european', //Afrikaans 198 'ar' => 'arabic', 199 'bg' => 'cyrillic', // Bulgarian 200 'bs' => 'east_european', // Bosnian 201 'cs' => 'east_european', // Czech 202 'da' => 'west_european', // Danish 203 'de' => 'west_european', // German 204 'es' => 'west_european', // Spanish 205 'et' => 'estonian', 206 'eo' => 'unicode', // Esperanto 207 'eu' => 'west_european', // Basque 208 'fa' => 'arabic', // Persian 209 'fi' => 'west_european', // Finish 210 'fo' => 'west_european', // Faroese 211 'fr' => 'west_european', // French 212 'ga' => 'west_european', // Irish 213 'gl' => 'west_european', // Galician 214 'gr' => 'greek', 215 'he' => 'hebrew', // Hebrew (since 1998) 216 'hi' => 'unicode', // Hindi 217 'hr' => 'east_european', // Croatian 218 'hu' => 'east_european', // Hungarian 219 'iw' => 'hebrew', // Hebrew (til 1998) 220 'is' => 'west_european', // Icelandic 221 'it' => 'west_european', // Italian 222 'ja' => 'japanese', 223 'ka' => 'unicode', // Georgian 224 'kl' => 'west_european', // Greenlandic 225 'km' => 'unicode', // Khmer 226 'ko' => 
'korean', 227 'lt' => 'lithuanian', 228 'lv' => 'west_european', // Latvian/Lettish 229 'nl' => 'west_european', // Dutch 230 'no' => 'west_european', // Norwegian 231 'nb' => 'west_european', // Norwegian Bokmal 232 'nn' => 'west_european', // Norwegian Nynorsk 233 'pl' => 'east_european', // Polish 234 'pt' => 'west_european', // Portuguese 235 'ro' => 'east_european', // Romanian 236 'ru' => 'cyrillic', // Russian 237 'sk' => 'east_european', // Slovak 238 'sl' => 'east_european', // Slovenian 239 'sr' => 'cyrillic', // Serbian 240 'sv' => 'west_european', // Swedish 241 'sq' => 'albanian', // Albanian 242 'th' => 'thai', 243 'uk' => 'cyrillic', // Ukranian 244 'vi' => 'vietnamese', 245 'zh' => 'chinese', 246 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp 247 // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp 248 'afk'=> 'west_european', // Afrikaans 249 'ara' => 'arabic', 250 'bgr' => 'cyrillic', // Bulgarian 251 'cat' => 'west_european', // Catalan 252 'chs' => 'simpl_chinese', 253 'cht' => 'trad_chinese', 254 'csy' => 'east_european', // Czech 255 'dan' => 'west_european', // Danisch 256 'deu' => 'west_european', // German 257 'dea' => 'west_european', // German (Austrian) 258 'des' => 'west_european', // German (Swiss) 259 'ena' => 'west_european', // English (Australian) 260 'enc' => 'west_european', // English (Canadian) 261 'eng' => 'west_european', // English 262 'enz' => 'west_european', // English (New Zealand) 263 'enu' => 'west_european', // English (United States) 264 'euq' => 'west_european', // Basque 265 'fos' => 'west_european', // Faroese 266 'far' => 'arabic', // Persian 267 'fin' => 'west_european', // Finish 268 'fra' => 'west_european', // French 269 'frb' => 'west_european', // French (Belgian) 270 'frc' => 'west_european', // French (Canadian) 271 'frs' => 'west_european', // 
French (Swiss) 272 'geo' => 'unicode', // Georgian 273 'glg' => 'west_european', // Galician 274 'ell' => 'greek', 275 'heb' => 'hebrew', 276 'hin' => 'unicode', // Hindi 277 'hun' => 'east_european', // Hungarian 278 'isl' => 'west_euorpean', // Icelandic 279 'ita' => 'west_european', // Italian 280 'its' => 'west_european', // Italian (Swiss) 281 'jpn' => 'japanese', 282 'khm' => 'unicode', // Khmer 283 'kor' => 'korean', 284 'lth' => 'lithuanian', 285 'lvi' => 'west_european', // Latvian/Lettish 286 'msl' => 'west_european', // Malay 287 'nlb' => 'west_european', // Dutch (Belgian) 288 'nld' => 'west_european', // Dutch 289 'nor' => 'west_european', // Norwegian (bokmal) 290 'non' => 'west_european', // Norwegian (nynorsk) 291 'plk' => 'east_european', // Polish 292 'ptg' => 'west_european', // Portuguese 293 'ptb' => 'west_european', // Portuguese (Brazil) 294 'rom' => 'east_european', // Romanian 295 'rus' => 'cyrillic', // Russian 296 'slv' => 'east_european', // Slovenian 297 'sky' => 'east_european', // Slovak 298 'srl' => 'east_european', // Serbian (Latin) 299 'srb' => 'cyrillic', // Serbian (Cyrillic) 300 'esp' => 'west_european', // Spanish (trad. sort) 301 'esm' => 'west_european', // Spanish (Mexican) 302 'esn' => 'west_european', // Spanish (internat. 
sort) 303 'sve' => 'west_european', // Swedish 304 'sqi' => 'albanian', // Albanian 305 'tha' => 'thai', 306 'trk' => 'turkish', 307 'ukr' => 'cyrillic', // Ukrainian 308 // English language names 309 'afrikaans' => 'west_european', 310 'albanian' => 'albanian', 311 'arabic' => 'arabic', 312 'basque' => 'west_european', 313 'bosnian' => 'east_european', 314 'bulgarian' => 'east_european', 315 'catalan' => 'west_european', 316 'croatian' => 'east_european', 317 'czech' => 'east_european', 318 'danish' => 'west_european', 319 'dutch' => 'west_european', 320 'english' => 'west_european', 321 'esperanto' => 'unicode', 322 'estonian' => 'estonian', 323 'faroese' => 'west_european', 324 'farsi' => 'arabic', 325 'finnish' => 'west_european', 326 'french' => 'west_european', 327 'galician' => 'west_european', 328 'georgian' => 'unicode', 329 'german' => 'west_european', 330 'greek' => 'greek', 331 'greenlandic' => 'west_european', 332 'hebrew' => 'hebrew', 333 'hindi' => 'unicode', 334 'hungarian' => 'east_european', 335 'icelandic' => 'west_european', 336 'italian' => 'west_european', 337 'khmer' => 'unicode', 338 'latvian' => 'west_european', 339 'lettish' => 'west_european', 340 'lithuanian' => 'lithuanian', 341 'malay' => 'west_european', 342 'norwegian' => 'west_european', 343 'persian' => 'arabic', 344 'polish' => 'east_european', 345 'portuguese' => 'west_european', 346 'russian' => 'cyrillic', 347 'romanian' => 'east_european', 348 'serbian' => 'cyrillic', 349 'slovak' => 'east_european', 350 'slovenian' => 'east_european', 351 'spanish' => 'west_european', 352 'svedish' => 'west_european', 353 'that' => 'thai', 354 'turkish' => 'turkish', 355 'ukrainian' => 'cyrillic', 356 ); 357 358 // mapping of language (family) names to charsets on Unix 359 var $script_to_charset_unix = array( 360 'west_european' => 'iso-8859-1', 361 'estonian' => 'iso-8859-1', 362 'east_european' => 'iso-8859-2', 363 'baltic' => 'iso-8859-4', 364 'cyrillic' => 'iso-8859-5', 365 'arabic' => 
'iso-8859-6', 366 'greek' => 'iso-8859-7', 367 'hebrew' => 'iso-8859-8', 368 'turkish' => 'iso-8859-9', 369 'thai' => 'iso-8859-11', // = TIS-620 370 'lithuanian' => 'iso-8859-13', 371 'chinese' => 'gb2312', // = euc-cn 372 'japanese' => 'euc-jp', 373 'korean' => 'euc-kr', 374 'simpl_chinese' => 'gb2312', 375 'trad_chinese' => 'big5', 376 'vietnamese' => '', 377 'unicode' => 'utf-8', 378 'albanian' => 'utf-8' 379 ); 380 381 // mapping of language (family) names to charsets on Windows 382 var $script_to_charset_windows = array( 383 'east_european' => 'windows-1250', 384 'cyrillic' => 'windows-1251', 385 'west_european' => 'windows-1252', 386 'greek' => 'windows-1253', 387 'turkish' => 'windows-1254', 388 'hebrew' => 'windows-1255', 389 'arabic' => 'windows-1256', 390 'baltic' => 'windows-1257', 391 'estonian' => 'windows-1257', 392 'lithuanian' => 'windows-1257', 393 'vietnamese' => 'windows-1258', 394 'thai' => 'cp874', 395 'korean' => 'cp949', 396 'chinese' => 'gb2312', 397 'japanese' => 'shift_jis', 398 'simpl_chinese' => 'gb2312', 399 'trad_chinese' => 'big5', 400 'albanian' => 'windows-1250', 401 'unicode' => 'utf-8' 402 ); 403 404 // mapping of locale names to charsets 405 var $locale_to_charset = array( 406 'japanese.euc' => 'euc-jp', 407 'ja_jp.ujis' => 'euc-jp', 408 'korean.euc' => 'euc-kr', 409 'sr@Latn' => 'iso-8859-2', 410 'zh_cn' => 'gb2312', 411 'zh_hk' => 'big5', 412 'zh_tw' => 'big5', 413 ); 414 415 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3: 416 // Empty values means "iso-8859-1" 417 var $charSetArray = array( 418 'af' => '', 419 'ar' => 'iso-8859-6', 420 'ba' => 'iso-8859-2', 421 'bg' => 'windows-1251', 422 'br' => '', 423 'ca' => 'iso-8859-15', 424 'ch' => 'gb2312', 425 'cs' => 'windows-1250', 426 'cz' => 'windows-1250', 427 'da' => '', 428 'de' => '', 429 'dk' => '', 430 'el' => 'iso-8859-7', 431 'eo' => 'utf-8', 432 'es' => '', 433 'et' => 'iso-8859-4', 434 'eu' => '', 435 'fa' => 'utf-8', 436 'fi' 
=> '', 437 'fo' => 'utf-8', 438 'fr' => '', 439 'fr_CA' => '', 440 'ga' => '', 441 'ge' => 'utf-8', 442 'gl' => '', 443 'gr' => 'iso-8859-7', 444 'he' => 'utf-8', 445 'hi' => 'utf-8', 446 'hk' => 'big5', 447 'hr' => 'windows-1250', 448 'hu' => 'iso-8859-2', 449 'is' => 'utf-8', 450 'it' => '', 451 'ja' => 'shift_jis', 452 'jp' => 'shift_jis', 453 'ka' => 'utf-8', 454 'kl' => 'utf-8', 455 'km' => 'utf-8', 456 'ko' => 'euc-kr', 457 'kr' => 'euc-kr', 458 'lt' => 'windows-1257', 459 'lv' => 'utf-8', 460 'ms' => '', 461 'my' => '', 462 'nl' => '', 463 'no' => '', 464 'pl' => 'iso-8859-2', 465 'pt' => '', 466 'pt_BR' => '', 467 'qc' => '', 468 'ro' => 'iso-8859-2', 469 'ru' => 'windows-1251', 470 'se' => '', 471 'si' => 'windows-1250', 472 'sk' => 'windows-1250', 473 'sl' => 'windows-1250', 474 'sq' => 'utf-8', 475 'sr' => 'utf-8', 476 'sv' => '', 477 'th' => 'iso-8859-11', 478 'tr' => 'iso-8859-9', 479 'ua' => 'windows-1251', 480 'uk' => 'windows-1251', 481 'vi' => 'utf-8', 482 'vn' => 'utf-8', 483 'zh' => 'big5', 484 ); 485 486 // TYPO3 specific: Array with the iso names used for each system language in TYPO3: 487 // Missing keys means: same as TYPO3 488 // @deprecated since TYPO3 4.6, will be removed in TYPO3 6.0 - use t3lib_l10n_Locales::getIsoMapping() 489 var $isoArray = array( 490 'ba' => 'bs', 491 'br' => 'pt_BR', 492 'ch' => 'zh_CN', 493 'cz' => 'cs', 494 'dk' => 'da', 495 'si' => 'sl', 496 'se' => 'sv', 497 'gl' => 'kl', 498 'gr' => 'el', 499 'hk' => 'zh_HK', 500 'kr' => 'ko', 501 'ua' => 'uk', 502 'jp' => 'ja', 503 'qc' => 'fr_CA', 504 'vn' => 'vi', 505 'ge' => 'ka', 506 'ga' => 'gl', 507 ); 508 509 /** 510 * Default constructor. 511 */ 512 public function __construct() { 513 $this->locales = t3lib_div::makeInstance('t3lib_l10n_Locales'); 514 } 515 516 /** 517 * Normalize - changes input character set to lowercase letters. 
518 * 519 * @param string Input charset 520 * @return string Normalized charset 521 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 522 */ 523 function parse_charset($charset) { 524 $charset = trim(strtolower($charset)); 525 if (isset($this->synonyms[$charset])) { 526 $charset = $this->synonyms[$charset]; 527 } 528 529 return $charset; 530 } 531 532 /** 533 * Get the charset of a locale. 534 * 535 * ln language 536 * ln_CN language / country 537 * ln_CN.cs language / country / charset 538 * ln_CN.cs@mod language / country / charset / modifier 539 * 540 * @param string Locale string 541 * @return string Charset resolved for locale string 542 * @author Martin Kutschker <martin.t.kutschker@blackbox.net> 543 */ 544 function get_locale_charset($locale) { 545 $locale = strtolower($locale); 546 547 // exact locale specific charset? 548 if (isset($this->locale_to_charset[$locale])) { 549 return $this->locale_to_charset[$locale]; 550 } 551 552 // get modifier 553 list($locale, $modifier) = explode('@', $locale); 554 555 // locale contains charset: use it 556 list($locale, $charset) = explode('.', $locale); 557 if ($charset) { 558 return $this->parse_charset($charset); 559 } 560 561 // modifier is 'euro' (after charset check, because of xx.utf-8@euro) 562 if ($modifier == 'euro') { 563 return 'iso-8859-15'; 564 } 565 566 // get language 567 list($language, $country) = explode('_', $locale); 568 if (isset($this->lang_to_script[$language])) { 569 $script = $this->lang_to_script[$language]; 570 } 571 572 if (TYPO3_OS == 'WIN') { 573 $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252'; 574 } else { 575 $cs = $this->script_to_charset_unix[$script] ? 
$this->script_to_charset_unix[$script] : 'utf-8'; 576 } 577 578 return $cs; 579 } 580 581 582 /******************************************** 583 * 584 * Charset Conversion functions 585 * 586 ********************************************/ 587 588 /** 589 * Convert from one charset to another charset. 590 * 591 * @param string Input string 592 * @param string From charset (the current charset of the string) 593 * @param string To charset (the output charset wanted) 594 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities 595 * @return string Converted string 596 * @see convArray() 597 */ 598 function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) { 599 if ($fromCS == $toCS) { 600 return $str; 601 } 602 603 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything 604 if ($toCS == 'utf-8' || !$useEntityForNoChar) { 605 switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) { 606 case 'mbstring': 607 $conv_str = mb_convert_encoding($str, $toCS, $fromCS); 608 if (FALSE !== $conv_str) { 609 return $conv_str; 610 } // returns FALSE for unsupported charsets 611 break; 612 613 case 'iconv': 614 $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str); 615 if (FALSE !== $conv_str) { 616 return $conv_str; 617 } 618 break; 619 620 case 'recode': 621 $conv_str = recode_string($fromCS . '..' . $toCS, $str); 622 if (FALSE !== $conv_str) { 623 return $conv_str; 624 } 625 break; 626 } 627 // fallback to TYPO3 conversion 628 } 629 630 if ($fromCS != 'utf-8') { 631 $str = $this->utf8_encode($str, $fromCS); 632 } 633 if ($toCS != 'utf-8') { 634 $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar); 635 } 636 return $str; 637 } 638 639 /** 640 * Convert all elements in ARRAY with type string from one charset to another charset. 641 * NOTICE: Array is passed by reference! 
642 * 643 * @param string Input array, possibly multidimensional 644 * @param string From charset (the current charset of the string) 645 * @param string To charset (the output charset wanted) 646 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities 647 * @return void 648 * @see conv() 649 */ 650 function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) { 651 foreach ($array as $key => $value) { 652 if (is_array($array[$key])) { 653 $this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar); 654 } elseif (is_string($array[$key])) { 655 $array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar); 656 } 657 } 658 } 659 660 /** 661 * Converts $str from $charset to UTF-8 662 * 663 * @param string String in local charset to convert to UTF-8 664 * @param string Charset, lowercase. Must be found in csconvtbl/ folder. 665 * @return string Output string, converted to UTF-8 666 */ 667 function utf8_encode($str, $charset) { 668 669 if ($charset === 'utf-8') { 670 return $str; 671 } 672 673 // Charset is case-insensitive. 674 if ($this->initCharset($charset)) { // Parse conv. table if not already... 675 $strLen = strlen($str); 676 $outStr = ''; 677 678 for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string. 679 $chr = substr($str, $a, 1); 680 $ord = ord($chr); 681 if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char 682 $ord2 = ord($str[$a + 1]); 683 $ord = $ord << 8 | $ord2; // assume big endian 684 685 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?) 
686 $outStr .= $this->parsedCharsets[$charset]['local'][$ord]; 687 } else { 688 $outStr .= chr($this->noCharByteVal); 689 } // No char exists 690 $a++; 691 } elseif ($ord > 127) { // If char has value over 127 it's a multibyte char in UTF-8 692 if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int. 693 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte 694 $a++; 695 $ord2 = ord(substr($str, $a, 1)); 696 $ord = $ord * 256 + $ord2; 697 } 698 } 699 700 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?) 701 $outStr .= $this->parsedCharsets[$charset]['local'][$ord]; 702 } else { 703 $outStr .= chr($this->noCharByteVal); 704 } // No char exists 705 } else { 706 $outStr .= $chr; 707 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent 708 } 709 return $outStr; 710 } 711 } 712 713 /** 714 * Converts $str from UTF-8 to $charset 715 * 716 * @param string String in UTF-8 to convert to local charset 717 * @param string Charset, lowercase. Must be found in csconvtbl/ folder. 718 * @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities 719 * @return string Output string, converted to local charset 720 */ 721 function utf8_decode($str, $charset, $useEntityForNoChar = 0) { 722 723 if ($charset === 'utf-8') { 724 return $str; 725 } 726 727 // Charset is case-insensitive. 728 if ($this->initCharset($charset)) { // Parse conv. table if not already... 729 $strLen = strlen($str); 730 $outStr = ''; 731 $buf = ''; 732 for ($a = 0, $i = 0; $a < $strLen; $a++, $i++) { // Traverse each char in UTF-8 string. 733 $chr = substr($str, $a, 1); 734 $ord = ord($chr); 735 if ($ord > 127) { // This means multibyte! (first byte!) 
736 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence. 737 738 $buf = $chr; // Add first byte 739 for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string... 740 $ord = $ord << 1; // Shift it left and ... 741 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 742 $a++; // Increase pointer... 743 $buf .= substr($str, $a, 1); // ... and add the next char. 744 } else { 745 break; 746 } 747 } 748 749 if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then... 750 $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number 751 if ($mByte > 255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars. 752 $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255); 753 } else { 754 $outStr .= chr($mByte); 755 } 756 } elseif ($useEntityForNoChar) { // Create num entity: 757 $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';'; 758 } else { 759 $outStr .= chr($this->noCharByteVal); 760 } // No char exists 761 } else { 762 $outStr .= chr($this->noCharByteVal); 763 } // No char exists (MIDDLE of MB sequence!) 764 } else { 765 $outStr .= $chr; 766 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent 767 } 768 return $outStr; 769 } 770 } 771 772 /** 773 * Converts all chars > 127 to numeric entities. 774 * 775 * @param string Input string 776 * @return string Output string 777 */ 778 function utf8_to_entities($str) { 779 $strLen = strlen($str); 780 $outStr = ''; 781 $buf = ''; 782 for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string. 783 $chr = substr($str, $a, 1); 784 $ord = ord($chr); 785 if ($ord > 127) { // This means multibyte! (first byte!) 786 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence. 
787 $buf = $chr; // Add first byte 788 for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string... 789 $ord = $ord << 1; // Shift it left and ... 790 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 791 $a++; // Increase pointer... 792 $buf .= substr($str, $a, 1); // ... and add the next char. 793 } else { 794 break; 795 } 796 } 797 798 $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';'; 799 } else { 800 $outStr .= chr($this->noCharByteVal); 801 } // No char exists (MIDDLE of MB sequence!) 802 } else { 803 $outStr .= $chr; 804 } // ... otherwise it's just ASCII 0-127 and one byte. Transparent 805 } 806 807 return $outStr; 808 } 809 810 /** 811 * Converts numeric entities (UNICODE, eg. decimal (Ӓ) or hexadecimal ()) to UTF-8 multibyte chars 812 * 813 * @param string Input string, UTF-8 814 * @param boolean If set, then all string-HTML entities (like & or £ will be converted as well) 815 * @return string Output string 816 */ 817 function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) { 818 // Workaround for #39287: 3rd parameter for get_html_translation_table() was only added in PHP 5.3.4 and later 819 // see http://php.net/manual/en/function.get-html-translation-table.php 820 $applyPhpCompatibilityFix = version_compare(phpversion(), '5.3.4', '<'); 821 822 if ($alsoStdHtmlEnt) { 823 if ($applyPhpCompatibilityFix === TRUE) { 824 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT)); 825 } else { 826 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8')); 827 } 828 } 829 830 $token = md5(microtime()); 831 $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '$2}' . 
$token, $str)); 832 foreach ($parts as $k => $v) { 833 // only take every second element 834 if ($k % 2 === 0) { 835 continue; 836 } 837 838 $position = 0; 839 if (substr($v, $position, 1) == '#') { // Dec or hex entities: 840 $position++; 841 if (substr($v, $position, 1) == 'x') { 842 $v = hexdec(substr($v, ++$position)); 843 } else { 844 $v = substr($v, $position); 845 } 846 $parts[$k] = $this->UnumberToChar($v); 847 } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities: 848 $v = $trans_tbl['&' . $v . ';']; 849 if ($applyPhpCompatibilityFix === TRUE) { 850 $v = $this->utf8_encode($v, 'iso-8859-1'); 851 } 852 $parts[$k] = $v; 853 } else { // No conversion: 854 $parts[$k] = '&' . $v . ';'; 855 } 856 } 857 858 return implode('', $parts); 859 } 860 861 /** 862 * Converts all chars in the input UTF-8 string into integer numbers returned in an array 863 * 864 * @param string Input string, UTF-8 865 * @param boolean If set, then all HTML entities (like & or £ or { or 㽝) will be detected as characters. 866 * @param boolean If set, then instead of integer numbers the real UTF-8 char is returned. 867 * @return array Output array with the char numbers 868 */ 869 function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) { 870 // If entities must be registered as well...: 871 if ($convEntities) { 872 $str = $this->entities_to_utf8($str, 1); 873 } 874 // Do conversion: 875 $strLen = strlen($str); 876 $outArr = array(); 877 $buf = ''; 878 for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string. 879 $chr = substr($str, $a, 1); 880 $ord = ord($chr); 881 if ($ord > 127) { // This means multibyte! (first byte!) 882 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence. 883 $buf = $chr; // Add first byte 884 for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string... 885 $ord = $ord << 1; // Shift it left and ... 
/**
 * Converts a UNICODE number to a UTF-8 multibyte character.
 * Algorithm based on script found at http://czyborra.com/utf/
 *
 * The binary representation of the character's integer value is spread across
 * the bytes; the number of high bits set in the lead byte announces the number
 * of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string; chr($this->noCharByteVal) if the number does not fit into 31 bits
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
    if ($cbyte < 0x80) {
        return chr($cbyte);
    }
    if ($cbyte < 0x800) {
        return chr(0xC0 | ($cbyte >> 6))
            . chr(0x80 | ($cbyte & 0x3F));
    }
    if ($cbyte < 0x10000) {
        return chr(0xE0 | ($cbyte >> 12))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }
    if ($cbyte < 0x200000) {
        return chr(0xF0 | ($cbyte >> 18))
            . chr(0x80 | (($cbyte >> 12) & 0x3F))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }
    if ($cbyte < 0x4000000) {
        return chr(0xF8 | ($cbyte >> 24))
            . chr(0x80 | (($cbyte >> 18) & 0x3F))
            . chr(0x80 | (($cbyte >> 12) & 0x3F))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }
    if ($cbyte < 0x80000000) {
        return chr(0xFC | ($cbyte >> 30))
            . chr(0x80 | (($cbyte >> 24) & 0x3F))
            . chr(0x80 | (($cbyte >> 18) & 0x3F))
            . chr(0x80 | (($cbyte >> 12) & 0x3F))
            . chr(0x80 | (($cbyte >> 6) & 0x3F))
            . chr(0x80 | ($cbyte & 0x3F));
    }

    // Cannot express a 32-bit character in UTF-8
    return chr($this->noCharByteVal);
}

/**
 * Converts a UTF-8 multibyte character to a UNICODE number.
 *
 * @param string $str UTF-8 multibyte character string (only the first character is evaluated)
 * @param boolean $hex If set, the number is returned as a hex string prefixed with "x".
 * @return mixed UNICODE integer, or string "x<hex>" if $hex is set
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
    $ord = ord(substr($str, 0, 1)); // First byte

    if (($ord & 192) == 192) { // The two top bits are set: this IS the lead byte of a multibyte sequence
        $binBuf = '';
        for ($b = 0; $b < 8; $b++) { // for each byte announced by the lead byte...
            $ord = $ord << 1; // Shift it left and ...
            if ($ord & 128) { // ... test the 8th bit - if set, there is one more byte in the sequence.
                // Every continuation byte contributes its lower 6 bits
                $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
            } else {
                break;
            }
        }
        // The lead byte contributes its lower (6 - $b) bits, where $b is the number of continuation bytes
        $binBuf = substr('00000000' . decbin(ord(substr($str, 0, 1))), -(6 - $b)) . $binBuf;

        $int = bindec($binBuf);
    } else {
        $int = $ord;
    }

    return $hex ? 'x' . dechex($int) : $int;
}


/********************************************
 *
 * Init functions
 *
 ********************************************/

/**
 * Initializes a charset for use if a conversion table for it exists in the PATH_t3lib.'csconvtbl/' folder.
 * The parsed table is stored in $this->parsedCharsets[$charset] with the keys
 * 'local' (byte value => UTF-8 character) and 'utf8' (UTF-8 character => byte value)
 * and cached as a serialized file below typo3temp/cs/.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the table was loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
    // Only process if the charset is not yet loaded (isset() avoids an undefined index notice):
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }

    // Conversion table filename:
    $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
    if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
        return FALSE;
    }

    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cached = unserialize(t3lib_div::getUrl($cacheFile));
        // Only trust the cache if it really contains a table; otherwise fall through and re-parse
        if (is_array($cached)) {
            $this->parsedCharsets[$charset] = $cached;
            return 2;
        }
    }

    // Parse conversion table into lines:
    $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
    // Initialize the internal variable holding the conv. table:
    $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';

    foreach ($lines as $value) {
        // Comment lines or blanks are ignored.
        if (!trim($value) || substr($value, 0, 1) == '#') {
            continue;
        }

        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }

        if ($detectedType == 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            if (count($parts) < 2) {
                continue; // malformed line: no "byte = U+xxxx" pair
            }
            $hexbyte = $parts[0];
            $utf8 = $parts[1];
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                continue; // malformed line: does not follow the detected syntax
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }

        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) { // 0-127 is plain ASCII and needs no table entry
            $utf8decval = hexdec(substr(trim($utf8), 2)); // strip the leading "U+"
            $utf8Char = $this->UnumberToChar($utf8decval);
            $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
    }

    return 2;
}
/**
 * This function initializes all UTF-8 character data tables.
 * Unless the table requested by $mode is already in memory or can be read from
 * its cache file, UnicodeData.txt (plus SpecialCasing.txt and Translit.txt if
 * present) is parsed and BOTH tables ($this->caseFolding['utf-8'] and
 * $this->toASCII['utf-8']) are rebuilt and cached.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...)
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
    // cache files
    $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

    // Only process if the tables are not yet loaded
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
                return 1;
            }

            // Use cached version if possible (a corrupt cache falls through to re-parsing)
            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $cached = unserialize(t3lib_div::getUrl($cacheFileCase));
                if (is_array($cached)) {
                    $this->caseFolding['utf-8'] = $cached;
                    return 2;
                }
            }
            break;

        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
                return 1;
            }

            // Use cached version if possible (a corrupt cache falls through to re-parsing)
            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
                if (is_array($cached)) {
                    $this->toASCII['utf-8'] = $cached;
                    return 2;
                }
            }
            break;
    }

    // process main Unicode data file
    $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
    if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
        return FALSE;
    }

    $fh = fopen($unicodeDataFile, 'rb');
    if (!$fh) {
        return FALSE;
    }

    // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
    // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
    $this->caseFolding['utf-8'] = array();
    $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();

    $decomposition = array(); // array of temp. decompositions
    $mark = array(); // array of chars that are marks (eg. composing accents)
    $number = array(); // array of chars that are numbers (eg. digits)
    $omit = array(); // array of chars to be omitted (eg. Russian hard sign)

    while (($line = fgets($fh, 4096)) !== FALSE) {
        $line = rtrim($line);
        if ($line === '') {
            continue; // a blank (e.g. trailing) line carries no character data
        }

        // Semicolon separated record; pad it so that short records do not trigger undefined offsets.
        // Used fields: 0 = code point, 1 = name, 2 = general category, 5 = decomposition,
        // 8 = numeric value, 12/13/14 = upper/lower/title case mapping
        $fields = array_pad(explode(';', $line), 15, '');
        $char = $fields[0];
        $name = $fields[1];
        $cat = $fields[2];
        $decomp = $fields[5];
        $num = $fields[8];
        $upper = $fields[12];
        $lower = $fields[13];
        $title = $fields[14];

        $ord = hexdec($char);
        if ($ord > 0xFFFF) {
            break; // only process the BMP
        }

        $utf8_char = $this->UnumberToChar($ord);

        if ($upper) {
            $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        }
        if ($lower) {
            $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        }
        // store "title" only when different from "upper" (only a few)
        if ($title && $title != $upper) {
            $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
        }

        switch (substr($cat, 0, 1)) {
            case 'M': // mark (accent, umlaut, ...)
                $mark["U+$char"] = 1;
                break;

            case 'N': // numeric value
                if ($ord > 0x80 && $num != '') {
                    $number["U+$char"] = $num;
                }
                break;
        }

        // accented Latin letters without "official" decomposition
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') {
                $c += 32;
            }

            $decomposition["U+$char"] = array(dechex($c));
            continue;
        }

        $match = array();
        if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
            switch ($match[1]) {
                case '<circle>': // add parenthesis as circle replacement, eg (1)
                    $match[2] = '0028 ' . $match[2] . ' 0029';
                    break;

                case '<square>': // add square brackets as square replacement, eg [1]
                    $match[2] = '005B ' . $match[2] . ' 005D';
                    break;

                case '<compat>': // ignore multi char decompositions that start with a space
                    if (preg_match('/^0020 /', $match[2])) {
                        continue 2; // next line of the data file
                    }
                    break;

                // ignore Arabic and vertical layout presentation decomposition
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                    continue 2; // next line of the data file
            }
            $decomposition["U+$char"] = explode(' ', $match[2]);
        }
    }
    fclose($fh);

    // process additional Unicode data for casing (allow folded characters to expand into a sequence)
    $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
    if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile, 'rb');
        if ($fh) {
            $codesToUtf8 = array($this, 'UnumberToChar');
            while (($line = fgets($fh, 4096)) !== FALSE) {
                // skip blank lines and comments (emptiness is tested first so that $line[0] always exists)
                if (trim($line) == '' || $line[0] == '#') {
                    continue;
                }

                list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
                // only unconditional mappings are used; a "condition" starting with # is just the trailing comment
                if (!($cond == '' || $cond[0] == '#')) {
                    continue;
                }

                $utf8_char = $this->UnumberToChar(hexdec($char));
                // each mapping is a space separated list of hex code points that is joined into one UTF-8 string
                if ($char != $lower) {
                    $utf8CaseFolding['toLower'][$utf8_char] = implode('', array_map($codesToUtf8, array_map('hexdec', explode(' ', $lower))));
                }
                if ($char != $title && $title != $upper) {
                    $utf8CaseFolding['toTitle'][$utf8_char] = implode('', array_map($codesToUtf8, array_map('hexdec', explode(' ', $title))));
                }
                if ($char != $upper) {
                    $utf8CaseFolding['toUpper'][$utf8_char] = implode('', array_map($codesToUtf8, array_map('hexdec', explode(' ', $upper))));
                }
            }
            fclose($fh);
        }
    }

    // process custom decompositions
    $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
    if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile, 'rb');
        if ($fh) {
            while (($line = fgets($fh, 4096)) !== FALSE) {
                if (trim($line) == '' || $line[0] == '#') {
                    continue;
                }
                list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
                if (!$translit) {
                    $omit["U+$char"] = 1;
                }
                $decomposition["U+$char"] = explode(' ', $translit);
            }
            fclose($fh);
        }
    }

    // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();

        // note: the loop ends at the first empty code value (an empty transliteration yields array(''))
        while ($code_value = array_shift($to)) {
            if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
                foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                    array_unshift($to, $cv);
                }
            } elseif (!isset($mark["U+$code_value"])) { // remove mark
                array_push($code_decomp, $code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }

    // create ascii only mapping
    $this->toASCII['utf-8'] = array();
    $ascii =& $this->toASCII['utf-8'];

    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                continue 2; // skip decompositions containing non-ASCII chars
            }
            array_push($code_decomp, chr($ord));
        }
        // $from is "U+<hex>"; strip the prefix instead of feeding non-hex characters to hexdec()
        $ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
    }

    // add numeric decompositions
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }

    if ($cacheFileCase) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
    }

    if ($cacheFileASCII) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
    }

    return 3;
}

/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
    // Only process if the case table is not yet loaded:
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }

    // Use cached version if possible (a corrupt cache falls through to rebuilding)
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cached = unserialize(t3lib_div::getUrl($cacheFile));
        if (is_array($cached)) {
            $this->caseFolding[$charset] = $cached;
            return 2;
        }
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return FALSE;
    }

    $directions = array('toUpper', 'toLower', 'toTitle');
    $this->caseFolding[$charset] = array('toUpper' => array(), 'toLower' => array(), 'toTitle' => array());

    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);

        foreach ($directions as $direction) {
            // most characters have no case mapping at all
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // only keep mappings whose target exists in this charset
            if ($cc != '' && $cc != $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }

    // add the ASCII case table
    for ($i = ord('a'); $i <= ord('z'); $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'); $i <= ord('Z'); $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }

    return 3;
}

/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
    // Only process if the table is not yet loaded:
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }

    // Use cached version if possible (a corrupt cache falls through to rebuilding)
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cached = unserialize(t3lib_div::getUrl($cacheFile));
        if (is_array($cached)) {
            $this->toASCII[$charset] = $cached;
            return 2;
        }
    }

    // init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return FALSE;
    }

    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return FALSE;
    }

    // Always create the table, even if no character of the charset has a transliteration;
    // otherwise the "already loaded" check above could never succeed for such a charset.
    $this->toASCII[$charset] = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);

        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }

    return 3;
}
/********************************************
 *
 * String operation functions
 *
 ********************************************/

/**
 * Returns a part of a string.
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is delegated to
 * mbstring or iconv; otherwise the built-in implementations of this class are used.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        // cannot omit $len, when specifying charset
        if ($len == NULL) {
            $enc = mb_internal_encoding(); // save internal encoding
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            mb_internal_encoding($enc); // restore internal encoding

            return $str;
        }
        return mb_substr($string, $start, $len, $charset);
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
        // cannot omit $len, when specifying charset
        if ($len == NULL) {
            $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    } elseif (!empty($this->twoByteSets[$charset])) {
        // NULL * 2 would be a length of 0 (= empty result), so an omitted length needs its own call
        return $len === NULL ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
    } elseif (!empty($this->fourByteSets[$charset])) {
        return $len === NULL ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
    }

    // treat everything else as single-byte encoding
    return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
}

/**
 * Counts the number of characters.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strlen($string, $charset);
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
        return iconv_strlen($string, $charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string, $charset);
    } elseif (!empty($this->twoByteSets[$charset])) {
        return strlen($string) / 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        return strlen($string) / 4;
    }
    // treat everything else as single-byte encoding
    return strlen($string);
}

/**
 * Method to crop strings using the mb_substr function.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters); a negative value crops from the end and keeps the tail
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    // nothing to crop: no length given or the string is short enough
    if (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len)) {
        return $string;
    }

    if ($len > 0) {
        return mb_substr($string, 0, $len, $charset) . $crop;
    }

    return $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
}
/**
 * Truncates a string and pre-/appends a string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters); a negative value keeps that many characters from the end
 * @param string $crop Crop signifier
 * @return string The shortened string, or the unchanged string if it is not longer than $len
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }

    if (intval($len) == 0) {
        return $string;
    }

    $byteLength = strlen($string);

    // translate the character position into a byte position
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = $len;
    } else {
        $i = $byteLength + $len;
        if ($i <= 0) {
            $i = FALSE;
        }
    }

    if ($i === FALSE) { // $len outside actual string length
        return $string;
    }

    if ($len > 0) {
        // Only crop if there is at least one byte at the cut position. The position helpers may
        // return the byte length itself for strings ending with a multibyte character.
        if ($i < $byteLength) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i >= 1 && $i <= $byteLength) {
        // there is at least one byte in front of the cut position
        return $crop . substr($string, $i);
    }

    return $string;
}
/**
 * Cuts a string short at a given byte length.
 * The cut position is moved towards the start where necessary so that no
 * multi-byte character is split.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_strtrunc($string, $len, $charset);
    } elseif (!empty($this->twoByteSets[$charset])) {
        // don't cut at odd positions
        if ($len % 2) {
            $len--;
        }
    } elseif (!empty($this->fourByteSets[$charset])) {
        // realign to position dividable by four
        $len -= $len % 4;
    }

    // fixed-width (after realignment) and single-byte encodings can be cut byte-wise
    return substr($string, 0, $len);
}
/**
 * Translates all characters of a string into their respective case values.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
    // mbstring handles every charset itself
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $case == 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }

    // treat everything else as single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}

/**
 * Equivalent of lcfirst/ucfirst but using character set.
 * Only the first character is case-converted; the rest of the string is passed through unchanged.
 *
 * @param string $charset
 * @param string $string
 * @param string $case
 * @return string
 * @see t3lib_cs::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
    $head = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    $tail = $this->substr($charset, $string, 1);

    return $head . $tail;
}
/**
 * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
    if ($charset == 'utf-8') {
        $string = $this->utf8_char_mapping($string, 'ascii');
    } elseif (isset($this->eucBasedSets[$charset])) {
        $string = $this->euc_char_mapping($string, $charset, 'ascii');
    } else {
        // treat everything else as single-byte encoding
        $string = $this->sb_char_mapping($string, $charset, 'ascii');
    }

    return $string;
}


/**
 * Converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * @param string $languageCodesList list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *			 see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string a preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
    $allLanguageCodes = array();
    $selectedLanguage = 'default';

    // get all languages where TYPO3 code is the same as the ISO code
    foreach ($this->charSetArray as $typo3Lang => $charSet) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }

    // get all languages where TYPO3 code differs from ISO code
    // or needs the country part
    // the iso codes will here overwrite the default typo3 language in the key
    foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
        $isoLang = join('-', explode('_', $isoLang));
        $allLanguageCodes[$typo3Lang] = $isoLang;
    }

    // move the iso codes to the keys (because we're comparing the keys with "isset" later on)
    $allLanguageCodes = array_flip($allLanguageCodes);

    $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
    // collect the preferred languages with their quality value
    $sortedPreferredLanguages = array();
    foreach ($preferredLanguages as $preferredLanguage) {
        $quality = 1.0; // a language without explicit quality has the highest priority
        if (strpos($preferredLanguage, ';q=') !== FALSE) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
        }
        $sortedPreferredLanguages[$preferredLanguage] = $quality;
    }

    // loop through the languages, with the highest priority first
    arsort($sortedPreferredLanguages, SORT_NUMERIC);
    foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }

        // strip the country code from the end; tags without country part are returned unchanged
        $languageParts = explode('-', $preferredLanguage, 2);
        $languageOnly = $languageParts[0];
        if (isset($allLanguageCodes[$languageOnly])) {
            $selectedLanguage = $allLanguageCodes[$languageOnly];
            break;
        }
    }

    // English is TYPO3's "default" language
    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }

    return $selectedLanguage;
}
/********************************************
 *
 * Internal string operation functions
 *
 ********************************************/

/**
 * Maps all characters of a string in a single byte charset.
 * Bytes without an entry in the mapping table are copied unchanged.
 *
 * @param string $str the string
 * @param string $charset the charset
 * @param string $mode mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt 'case': conversion 'toLower' or 'toUpper'
 * @return string the converted string; the unchanged string if the mode is unknown or the table could not be initialized
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str; // do nothing
            }
            $map =& $this->caseFolding[$charset][$opt];
            break;

        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str; // do nothing
            }
            $map =& $this->toASCII[$charset];
            break;

        default:
            return $str;
    }

    $out = '';
    // iterate by byte length instead of probing $str[$i] past the end of the string
    $byteLength = strlen($str);
    for ($i = 0; $i < $byteLength; $i++) {
        $c = $str[$i];
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }

    return $out;
}
/********************************************
 *
 * Internal UTF-8 string operation functions
 *
 ********************************************/

/**
 * Returns a part of a UTF-8 string.
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL means "up to the end of the string"
 * @return string The substring; FALSE if a positive $start lies outside the string
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
    // a length of zero (integer or string "0") always yields an empty string
    if ($len !== NULL && !strcmp((string) $len, '0')) {
        return '';
    }

    $byte_start = $this->utf8_char2byte_pos($str, $start);
    if ($byte_start === FALSE) {
        if ($start > 0) {
            return FALSE; // $start outside string length
        }
        // zero or negative start beyond the string: begin at the first byte
        $byte_start = 0;
    }

    $str = substr($str, $byte_start);

    if ($len != NULL) {
        $byte_end = $this->utf8_char2byte_pos($str, $len);
        if ($byte_end === FALSE) { // $len outside actual string length
            // When length is less than zero and exceeds, then we return blank string.
            return $len < 0 ? '' : $str;
        }
        return substr($str, 0, $byte_end);
    }

    return $str;
}

/**
 * Counts the number of characters of a string in UTF-8.
 * Every byte that is not a continuation byte (10xxxxxx) starts a character.
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
    $n = 0;
    $byteLength = strlen($str);
    for ($i = 0; $i < $byteLength; $i++) {
        $c = ord($str[$i]);
        // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
        if (!($c & 0x80) || ($c & 0xC0) == 0xC0) {
            $n++;
        }
    }
    return $n;
}
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * If the cut position falls inside a multibyte character, that character is dropped
 * completely; a character that fits exactly into the length is kept.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len the byte length
 * @return string the shortened string; an empty string if $len is not positive
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
    if ($len <= 0) {
        return '';
    }
    if ($len > strlen($str)) {
        return $str; // nothing to cut
    }

    $i = $len - 1;
    if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
        // walk back to the first byte of the sequence (11xxxxxx); it may well be byte 0
        while ($i > 0 && !(ord($str[$i]) & 0x40)) {
            $i--;
        }
        if (!(ord($str[$i]) & 0x40)) {
            return ''; // sanity check: only continuation bytes found, no starting byte
        }
        // calculate number of bytes of the sequence from the leading 1-bits of its first byte
        for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) {
            $bc++;
        }
        if ($bc + $i > $len) {
            return substr($str, 0, $i); // character does not fit: cut in front of it
        }
        // fallthru: multibyte char fits into length
    }

    return substr($str, 0, $len);
}

/**
 * Find position of first occurrence of a string, both arguments are in UTF-8.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search (character position)
 * @return integer The character position; FALSE if the needle was not found or the offset is beyond the string
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }

    $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
    if ($byte_offset === FALSE) {
        return FALSE; // offset beyond string length
    }

    $byte_pos = strpos($haystack, $needle, $byte_offset);
    if ($byte_pos === FALSE) {
        return FALSE; // needle not found
    }

    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
a char in a string, both arguments are in UTF-8.
	 *
	 * @param string UTF-8 string to search in
	 * @param string UTF-8 character to search for (single character)
	 * @return integer The character position, FALSE if the needle was not found
	 * @see strrpos()
	 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_strrpos($haystack, $needle) {
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
				// the third parameter of mb_strrpos() is the offset, the encoding is the fourth one
			return mb_strrpos($haystack, $needle, 0, 'utf-8');
		} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
			return iconv_strrpos($haystack, $needle, 'utf-8');
		}

		$byte_pos = strrpos($haystack, $needle);
		if ($byte_pos === FALSE) {
				// needle not found
			return FALSE;
		}

		return $this->utf8_byte2char_pos($haystack, $byte_pos);
	}

	/**
	 * Translates a character position into an 'absolute' byte position.
	 * Unit tested by Kasper.
	 *
	 * The string is only addressed inside its bounds; the scan from the end does
	 * not rely on what an out-of-range or negative string offset evaluates to.
	 *
	 * @param string UTF-8 string
	 * @param integer Character position (negative values start from the end)
	 * @return integer Byte position, FALSE if the position lies beyond the string or the string has fewer than |$pos| characters
	 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_char2byte_pos($str, $pos) {
		$length = strlen($str);
		$n = 0; // number of characters found
		$p = abs($pos); // number of characters wanted

		if ($pos >= 0) {
			for ($i = 0; $i < $length && $n < $p; $i++) {
					// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) begin a
					// character, continuation bytes (10xxxxxx) do not
				if ((ord($str[$i]) & 0xC0) != 0x80) {
					$n++;
				}
			}
			if ($i >= $length) {
					// offset beyond string length
				return FALSE;
			}

				// skip trailing multi-byte data bytes
			while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) {
				$i++;
			}

			return $i;
		}

		for ($i = $length - 1; $i >= 0 && $n < $p; $i--) {
			if ((ord($str[$i]) & 0xC0) != 0x80) {
				$n++;
			}
		}
		if ($n < $p) {
				// the string has fewer characters than requested
			return FALSE;
		}

			// correct offset: $i stopped one byte in front of the character's first byte
		return $i + 1;
	}

	/**
	 * Translates an 'absolute' byte position into a character position.
	 * Unit tested by Kasper.
	 *
	 * A position equal to the byte length of the string (the end of the string) is
	 * accepted and yields the number of characters in front of it.
	 *
	 * @param string UTF-8 string
	 * @param integer byte position
	 * @return integer character position, FALSE for an empty string or a position outside the string
	 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_byte2char_pos($str, $pos) {
		$length = strlen($str);
		if ($length == 0 || $pos < 0 || $pos > $length) {
				// offset beyond string length
			return FALSE;
		}

		$n = 0; // number of characters
		for ($i = $pos; $i > 0; $i--) {
				// count every byte in 1..$pos that starts a character; byte 0 is not counted,
				// which makes the result the zero-based index of the character at $pos
			if ($i == $length || (ord($str[$i]) & 0xC0) != 0x80) {
				$n++;
			}
		}

		return $n;
	}

	/**
	 * Maps all characters of an UTF-8 string.
	 *
	 * Characters without an entry in the selected map, and bytes that do not start a
	 * well-formed sequence, are copied to the output unchanged.
	 *
	 * @param string UTF-8 string
	 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param string 'case': conversion 'toLower' or 'toUpper'
	 * @return string the converted string
	 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function utf8_char_mapping($str, $mode, $opt = '') {
		if (!$this->initUnicodeData($mode)) {
				// do nothing
			return $str;
		}

		switch ($mode) {
			case 'case':
				$map =& $this->caseFolding['utf-8'][$opt];
				break;

			case 'ascii':
				$map =& $this->toASCII['utf-8'];
				break;

			default:
				return $str;
		}

		$out = '';
		$length = strlen($str);
		for ($i = 0; $i < $length; $i++) {
				// single-byte (0xxxxxxx); a stray continuation byte is treated the same way
				// so it cannot repeat the character mapped in the previous round
			$mbc = $str[$i];
			$c = ord($mbc);
			if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
					// calculate number of bytes
				for ($bc = 0; $c & 0x80; $c = $c << 1) {
					$bc++;
				}
				$mbc = substr($str, $i, $bc);
				$i += $bc - 1;
			}

			if (isset($map[$mbc])) {
				$out .= $map[$mbc];
			} else {
				$out .= $mbc;
			}
		}

		return $out;
	}


	/********************************************
	 *
	 * Internal EUC string operation functions
	 *
	 * Extended Unix Code:
	 *  ASCII compatible 7bit single bytes chars
	 *  8bit two byte chars
	 *
	 * Shift-JIS is treated as a special case.
	 *
	 ********************************************/

	/**
	 * Cuts a string in the EUC charset family short at a given byte length.
	 *
	 * A double-byte character that would be split by the cut is dropped as a whole,
	 * so the result never exceeds $len bytes.
	 *
	 * @param string EUC multibyte character string
	 * @param integer the byte length
	 * @param string the charset
	 * @return string the shortened string
	 * @see mb_strcut()
	 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_strtrunc($str, $len, $charset) {
		if ($len <= 0) {
				// nothing fits
			return '';
		}
		if (strlen($str) <= $len) {
				// string shorter than supplied length
			return $str;
		}

		$sjis = ($charset == 'shift_jis');
			// walk character by character until the cut is reached or stepped over;
			// $i stays inside the string because the string is longer than $len
		for ($i = 0; $i < $len; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
						// advance a double-byte char
					$i++;
				}
			} else {
				if ($c >= 0x80) {
						// advance a double-byte char
					$i++;
				}
			}
		}

		if ($i > $len) {
				// we ended on a first byte
			return substr($str, 0, $len - 1);
		}

		return substr($str, 0, $len);
	}

	/**
	 * Returns a part of a string in the EUC charset family.
*
	 * Positions and lengths are counted in characters and resolved to byte positions
	 * with euc_char2byte_pos(). Edge cases follow substr(): a length of zero gives an
	 * empty string, a negative start reaching before the string starts at its first
	 * character, a negative length reaching before the start gives an empty string.
	 *
	 * @param string EUC multibyte character string
	 * @param integer start position (character position)
	 * @param string the charset
	 * @param integer length (in characters)
	 * @return string the substring, FALSE if a non-negative $start cannot be resolved inside the string
	 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_substr($str, $start, $charset, $len = NULL) {
		$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
		if ($byte_start === FALSE) {
			if ($start >= 0) {
					// $start outside string length
				return FALSE;
			}
				// negative $start reaching before the string: begin at the first byte
			$byte_start = 0;
		}

		if ($len === 0 || $len === '0') {
				// an explicit length of zero is not the same as "no length given"
			return '';
		}

		$str = substr($str, $byte_start);

		if ($len != NULL) {
			$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
			if ($byte_end === FALSE) {
					// $len outside actual string length: a positive length keeps the rest,
					// a negative length cuts away everything
				return $len < 0 ? '' : $str;
			}

			return substr($str, 0, $byte_end);
		}

		return $str;
	}

	/**
	 * Counts the number of characters of a string in the EUC charset family.
	 *
	 * @param string EUC multibyte character string
	 * @param string the charset
	 * @return integer the number of characters
	 * @see strlen()
	 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_strlen($str, $charset) {
		$sjis = ($charset == 'shift_jis');
		$length = strlen($str);
		$n = 0;
			// the byte length bounds the loop, the string is never read past its end
		for ($i = 0; $i < $length; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
						// advance a double-byte char
					$i++;
				}
			} else {
				if ($c >= 0x80) {
						// advance a double-byte char
					$i++;
				}
			}

			$n++;
		}

		return $n;
	}

	/**
	 * Translates a character position into an 'absolute' byte position.
*
	 * The string is only addressed inside its bounds; the scan from the end does
	 * not rely on what an out-of-range or negative string offset evaluates to.
	 *
	 * NOTE(review): the scan from the end classifies a byte as the end of a
	 * double-byte character by its value alone; for Shift-JIS, whose second bytes
	 * may lie in the ASCII range, this presumably is not reliable - confirm before
	 * relying on negative positions with that charset.
	 *
	 * @param string EUC multibyte character string
	 * @param integer character position (negative values start from the end)
	 * @param string the charset
	 * @return integer byte position, FALSE if the position lies beyond the string or the string has fewer than |$pos| characters
	 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_char2byte_pos($str, $pos, $charset) {
		$sjis = ($charset == 'shift_jis');
		$length = strlen($str);
		$n = 0; // number of characters seen
		$p = abs($pos); // number of characters wanted

		if ($pos >= 0) {
			$i = 0;
			$d = 1;
		} else {
			$i = $length - 1;
			$d = -1;
		}

		for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
						// advance a double-byte char
					$i += $d;
				}
			} else {
				if ($c >= 0x80) {
						// advance a double-byte char
					$i += $d;
				}
			}

			$n++;
		}

		if ($pos >= 0) {
			if ($i >= $length) {
					// offset beyond string length
				return FALSE;
			}

			return $i;
		}

		if ($n < $p) {
				// the string has fewer characters than requested
			return FALSE;
		}

			// correct offset: $i stopped in front of the character; never report a
			// position in front of the string
		return max($i + 1, 0);
	}

	/**
	 * Maps all characters of a string in the EUC charset family.
*
	 * The map is taken from $this->caseFolding or $this->toASCII for the given
	 * charset; if the matching init method reports failure the string is returned
	 * unchanged. Characters without an entry in the map are copied as they are.
	 *
	 * @param string EUC multibyte character string
	 * @param string the charset
	 * @param string mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param string 'case': conversion 'toLower' or 'toUpper'
	 * @return string the converted string
	 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function euc_char_mapping($str, $charset, $mode, $opt = '') {
		switch ($mode) {
			case 'case':
				if (!$this->initCaseFolding($charset)) {
						// do nothing
					return $str;
				}
				$map =& $this->caseFolding[$charset][$opt];
				break;

			case 'ascii':
				if (!$this->initToASCII($charset)) {
						// do nothing
					return $str;
				}
				$map =& $this->toASCII[$charset];
				break;

			default:
				return $str;
		}

		$sjis = ($charset == 'shift_jis');
		$out = '';
		$length = strlen($str);
			// the byte length bounds the loop, the string is never read past its end
		for ($i = 0; $i < $length; $i++) {
			$mbc = $str[$i];
			$c = ord($mbc);

			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
					$mbc = substr($str, $i, 2);
					$i++;
				}
			} else {
				if ($c >= 0x80) { // a double-byte char
					$mbc = substr($str, $i, 2);
					$i++;
				}
			}

			if (isset($map[$mbc])) {
				$out .= $map[$mbc];
			} else {
				$out .= $mbc;
			}
		}

		return $out;
	}

}

if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
	include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}

?>
title
Description
Body
title
Description
Body
title
Description
Body
title
Body