Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 3.11.x will end 14 Nov 2022 (12 months plus 6 months extension).
  • Bug fixes for security issues in 3.11.x will end 13 Nov 2023 (18 months plus 12 months extension).
  • PHP version: minimum PHP 7.3.0 Note: minimum PHP version has increased since Moodle 3.10. PHP 7.4.x is supported too.
   1  <?php
   2  /***************************************************************
   3   *  Copyright notice
   4   *
   5   *  (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
   6   *  All rights reserved
   7   *
   8   *  This script is part of the Typo3 project. The Typo3 project is
   9   *  free software; you can redistribute it and/or modify
  10   *  it under the terms of the GNU General Public License as published by
  11   *  the Free Software Foundation; either version 2 of the License, or
  12   *  (at your option) any later version.
  13   *
  14   *  The GNU General Public License can be found at
  15   *  http://www.gnu.org/copyleft/gpl.html.
  16   *
  17   *  This script is distributed in the hope that it will be useful,
  18   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20   *  GNU General Public License for more details.
  21   *
  22   *  This copyright notice MUST APPEAR in all copies of the script!
  23   ***************************************************************/
  24  /**
  25   * Class for conversion between charsets.
  26   *
  27   * @author	 Kasper Skårhøj <kasperYYYY@typo3.com>
  28   * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
  29   */
  30  
  31  
  32  /**
  33   * Notes on UTF-8
  34   *
  35   * Functions working on UTF-8 strings:
  36   *
  37   * - strchr/strstr
  38   * - strrchr
  39   * - substr_count
  40   * - implode/explode/join
  41   *
  42   * Functions nearly working on UTF-8 strings:
  43   *
  44   * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
  45   * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  46   * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  47   * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
  48   * - preg_*: UTF-8 support is compiled into PCRE by default nowadays, but it could be unavailable; patterns need the "u" modifier to be treated as UTF-8
  49   *
  50   * Functions NOT working on UTF-8 strings:
  51   *
  52   * - str*cmp
  53   * - stristr
  54   * - stripos
  55   * - substr
  56   * - strrev
  57   * - split/spliti
  58   * - ...
  59   *
  60   */
  61  /**
  62   * Class for conversion between charsets
  63   *
  64   * @author	 Kasper Skårhøj <kasperYYYY@typo3.com>
  65   * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
  66   * @package TYPO3
  67   * @subpackage t3lib
  68   */
  69  class t3lib_cs {
  70  
  71  	 /**
  72  	  * @var t3lib_l10n_Locales
  73  	  */
  74  	 protected $locales;
  75  
  76  	 var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
  77  
  78  	 	 // This is the array where parsed conversion tables are stored (cached)
  79  	 var $parsedCharsets = array();
  80  
  81  	 	 // An array where case folding data will be stored (cached)
  82  	 var $caseFolding = array();
  83  
  84  	 	 // An array where charset-to-ASCII mappings are stored (cached)
  85  	 var $toASCII = array();
  86  
  87  	 	 // This tells the converter which charsets has two bytes per char:
  88  	 var $twoByteSets = array(
  89  	 	 'ucs-2' => 1, // 2-byte Unicode
  90  	 );
  91  
  92  	 	 // This tells the converter which charsets has four bytes per char:
  93  	 var $fourByteSets = array(
  94  	 	 'ucs-4' => 1, // 4-byte Unicode
  95  	 	 'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
  96  	 );
  97  
  98  	 	 // This tells the converter which charsets use a scheme like the Extended Unix Code:
  99  	 var $eucBasedSets = array(
 100  	 	 'gb2312' => 1, // Chinese, simplified.
 101  	 	 'big5' => 1, // Chinese, traditional.
 102  	 	 'euc-kr' => 1, // Korean
 103  	 	 'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
 104  	 );
 105  
	/**
	 * Charset aliases: alternative (lowercase) charset name => name used by this class.
	 * parse_charset() looks up the trimmed, lowercased input in this table.
	 *
	 * @see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
	 * @see http://czyborra.com/charsets/iso8859.html
	 * @var array
	 */
	public $synonyms = array(
		'us' => 'ascii',
		'us-ascii' => 'ascii',
		'cp819' => 'iso-8859-1',
		'ibm819' => 'iso-8859-1',
		'iso-ir-100' => 'iso-8859-1',
		'iso-ir-101' => 'iso-8859-2',
		'iso-ir-109' => 'iso-8859-3',
		'iso-ir-110' => 'iso-8859-4',
		'iso-ir-144' => 'iso-8859-5',
		'iso-ir-127' => 'iso-8859-6',
		'iso-ir-126' => 'iso-8859-7',
		'iso-ir-138' => 'iso-8859-8',
		'iso-ir-148' => 'iso-8859-9',
		'iso-ir-157' => 'iso-8859-10',
		'iso-ir-179' => 'iso-8859-13',
		'iso-ir-199' => 'iso-8859-14',
		'iso-ir-203' => 'iso-8859-15',
		'csisolatin1' => 'iso-8859-1',
		'csisolatin2' => 'iso-8859-2',
		'csisolatin3' => 'iso-8859-3',
		'csisolatin5' => 'iso-8859-9',
		'csisolatin8' => 'iso-8859-14',
		'csisolatin9' => 'iso-8859-15',
		'csisolatingreek' => 'iso-8859-7',
		'iso-celtic' => 'iso-8859-14',
		'latin1' => 'iso-8859-1',
		'latin2' => 'iso-8859-2',
		'latin3' => 'iso-8859-3',
		'latin5' => 'iso-8859-9',
		'latin6' => 'iso-8859-10',
		'latin8' => 'iso-8859-14',
		'latin9' => 'iso-8859-15',
		'l1' => 'iso-8859-1',
		'l2' => 'iso-8859-2',
		'l3' => 'iso-8859-3',
		'l5' => 'iso-8859-9',
		'l6' => 'iso-8859-10',
		'l8' => 'iso-8859-14',
		'l9' => 'iso-8859-15',
		'cyrillic' => 'iso-8859-5',
		'arabic' => 'iso-8859-6',
		'tis-620' => 'iso-8859-11',
		'win874' => 'windows-874',
		'win1250' => 'windows-1250',
		'win1251' => 'windows-1251',
		'win1252' => 'windows-1252',
		'win1253' => 'windows-1253',
		'win1254' => 'windows-1254',
		'win1255' => 'windows-1255',
		'win1256' => 'windows-1256',
		'win1257' => 'windows-1257',
		'win1258' => 'windows-1258',
			// "cp874" is what $script_to_charset_windows yields for Thai, so it needs an alias as well
		'cp874' => 'windows-874',
		'cp1250' => 'windows-1250',
		'cp1251' => 'windows-1251',
		'cp1252' => 'windows-1252',
			// remaining Windows code pages, parallel to the win125x aliases above
		'cp1253' => 'windows-1253',
		'cp1254' => 'windows-1254',
		'cp1255' => 'windows-1255',
		'cp1256' => 'windows-1256',
		'cp1257' => 'windows-1257',
		'cp1258' => 'windows-1258',
		'ms-ee' => 'windows-1250',
		'ms-ansi' => 'windows-1252',
		'ms-greek' => 'windows-1253',
		'ms-turk' => 'windows-1254',
		'winbaltrim' => 'windows-1257',
		'koi-8ru' => 'koi-8r',
		'koi8r' => 'koi-8r',
		'cp878' => 'koi-8r',
		'mac' => 'macroman',
		'macintosh' => 'macroman',
		'euc-cn' => 'gb2312',
		'x-euc-cn' => 'gb2312',
		'euccn' => 'gb2312',
		'cp936' => 'gb2312',
		'big-5' => 'big5',
		'cp950' => 'big5',
		'eucjp' => 'euc-jp',
		'sjis' => 'shift_jis',
		'shift-jis' => 'shift_jis',
		'cp932' => 'shift_jis',
		'cp949' => 'euc-kr',
		'utf7' => 'utf-7',
			// 'utf8' was listed twice in the original literal; one entry is enough
		'utf8' => 'utf-8',
		'utf16' => 'utf-16',
		'utf32' => 'utf-32',
		'ucs2' => 'ucs-2',
		'ucs4' => 'ucs-4',
	);
 193  
	/**
	 * Mapping of language identifiers to script (language family) names.
	 * get_locale_charset() uses the resulting script name as key into
	 * $script_to_charset_unix / $script_to_charset_windows, so every value
	 * used here must be a key of those tables.
	 *
	 * @var array
	 */
	public $lang_to_script = array(
			// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
		'af' => 'west_european', // Afrikaans
		'ar' => 'arabic',
		'bg' => 'cyrillic', // Bulgarian
		'bs' => 'east_european', // Bosnian
		'cs' => 'east_european', // Czech
		'da' => 'west_european', // Danish
		'de' => 'west_european', // German
		'el' => 'greek', // Greek (the actual iso-639-1 code; 'gr' below is kept for compatibility)
		'es' => 'west_european', // Spanish
		'et' => 'estonian',
		'eo' => 'unicode', // Esperanto
		'eu' => 'west_european', // Basque
		'fa' => 'arabic', // Persian
		'fi' => 'west_european', // Finnish
		'fo' => 'west_european', // Faroese
		'fr' => 'west_european', // French
		'ga' => 'west_european', // Irish
		'gl' => 'west_european', // Galician
		'gr' => 'greek',
		'he' => 'hebrew', // Hebrew (since 1998)
		'hi' => 'unicode', // Hindi
		'hr' => 'east_european', // Croatian
		'hu' => 'east_european', // Hungarian
		'iw' => 'hebrew', // Hebrew (til 1998)
		'is' => 'west_european', // Icelandic
		'it' => 'west_european', // Italian
		'ja' => 'japanese',
		'ka' => 'unicode', // Georgian
		'kl' => 'west_european', // Greenlandic
		'km' => 'unicode', // Khmer
		'ko' => 'korean',
		'lt' => 'lithuanian',
		'lv' => 'west_european', // Latvian/Lettish
		'nl' => 'west_european', // Dutch
		'no' => 'west_european', // Norwegian
		'nb' => 'west_european', // Norwegian Bokmal
		'nn' => 'west_european', // Norwegian Nynorsk
		'pl' => 'east_european', // Polish
		'pt' => 'west_european', // Portuguese
		'ro' => 'east_european', // Romanian
		'ru' => 'cyrillic', // Russian
		'sk' => 'east_european', // Slovak
		'sl' => 'east_european', // Slovenian
		'sr' => 'cyrillic', // Serbian
		'sv' => 'west_european', // Swedish
		'sq' => 'albanian', // Albanian
		'th' => 'thai',
		'tr' => 'turkish', // Turkish (was only reachable through 'trk' / 'turkish')
		'uk' => 'cyrillic', // Ukrainian
		'vi' => 'vietnamese',
		'zh' => 'chinese',
			// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
			// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
		'afk' => 'west_european', // Afrikaans
		'ara' => 'arabic',
		'bgr' => 'cyrillic', // Bulgarian
		'cat' => 'west_european', // Catalan
		'chs' => 'simpl_chinese',
		'cht' => 'trad_chinese',
		'csy' => 'east_european', // Czech
		'dan' => 'west_european', // Danish
		'deu' => 'west_european', // German
		'dea' => 'west_european', // German (Austrian)
		'des' => 'west_european', // German (Swiss)
		'ena' => 'west_european', // English (Australian)
		'enc' => 'west_european', // English (Canadian)
		'eng' => 'west_european', // English
		'enz' => 'west_european', // English (New Zealand)
		'enu' => 'west_european', // English (United States)
		'euq' => 'west_european', // Basque
		'fos' => 'west_european', // Faroese
		'far' => 'arabic', // Persian
		'fin' => 'west_european', // Finnish
		'fra' => 'west_european', // French
		'frb' => 'west_european', // French (Belgian)
		'frc' => 'west_european', // French (Canadian)
		'frs' => 'west_european', // French (Swiss)
		'geo' => 'unicode', // Georgian
		'glg' => 'west_european', // Galician
		'ell' => 'greek',
		'heb' => 'hebrew',
		'hin' => 'unicode', // Hindi
		'hun' => 'east_european', // Hungarian
		'isl' => 'west_european', // Icelandic (value was misspelled "west_euorpean", which matched no charset table entry)
		'ita' => 'west_european', // Italian
		'its' => 'west_european', // Italian (Swiss)
		'jpn' => 'japanese',
		'khm' => 'unicode', // Khmer
		'kor' => 'korean',
		'lth' => 'lithuanian',
		'lvi' => 'west_european', // Latvian/Lettish
		'msl' => 'west_european', // Malay
		'nlb' => 'west_european', // Dutch (Belgian)
		'nld' => 'west_european', // Dutch
		'nor' => 'west_european', // Norwegian (bokmal)
		'non' => 'west_european', // Norwegian (nynorsk)
		'plk' => 'east_european', // Polish
		'ptg' => 'west_european', // Portuguese
		'ptb' => 'west_european', // Portuguese (Brazil)
		'rom' => 'east_european', // Romanian
		'rus' => 'cyrillic', // Russian
		'slv' => 'east_european', // Slovenian
		'sky' => 'east_european', // Slovak
		'srl' => 'east_european', // Serbian (Latin)
		'srb' => 'cyrillic', // Serbian (Cyrillic)
		'esp' => 'west_european', // Spanish (trad. sort)
		'esm' => 'west_european', // Spanish (Mexican)
		'esn' => 'west_european', // Spanish (internat. sort)
		'sve' => 'west_european', // Swedish
		'sqi' => 'albanian', // Albanian
		'tha' => 'thai',
		'trk' => 'turkish',
		'ukr' => 'cyrillic', // Ukrainian
			// English language names
		'afrikaans' => 'west_european',
		'albanian' => 'albanian',
		'arabic' => 'arabic',
		'basque' => 'west_european',
		'bosnian' => 'east_european',
		'bulgarian' => 'cyrillic', // was "east_european", contradicting 'bg' and 'bgr' above
		'catalan' => 'west_european',
		'croatian' => 'east_european',
		'czech' => 'east_european',
		'danish' => 'west_european',
		'dutch' => 'west_european',
		'english' => 'west_european',
		'esperanto' => 'unicode',
		'estonian' => 'estonian',
		'faroese' => 'west_european',
		'farsi' => 'arabic',
		'finnish' => 'west_european',
		'french' => 'west_european',
		'galician' => 'west_european',
		'georgian' => 'unicode',
		'german' => 'west_european',
		'greek' => 'greek',
		'greenlandic' => 'west_european',
		'hebrew' => 'hebrew',
		'hindi' => 'unicode',
		'hungarian' => 'east_european',
		'icelandic' => 'west_european',
		'italian' => 'west_european',
		'khmer' => 'unicode',
		'latvian' => 'west_european',
		'lettish' => 'west_european',
		'lithuanian' => 'lithuanian',
		'malay' => 'west_european',
		'norwegian' => 'west_european',
		'persian' => 'arabic',
		'polish' => 'east_european',
		'portuguese' => 'west_european',
		'russian' => 'cyrillic',
		'romanian' => 'east_european',
		'serbian' => 'cyrillic',
		'slovak' => 'east_european',
		'slovenian' => 'east_european',
		'spanish' => 'west_european',
		'swedish' => 'west_european', // key was misspelled "svedish"
		'thai' => 'thai', // key was misspelled "that"
		'turkish' => 'turkish',
		'ukrainian' => 'cyrillic',
	);
 357  
	/**
	 * Script (language family) name => charset, used by get_locale_charset()
	 * when TYPO3_OS is not "WIN". An empty or missing entry makes
	 * get_locale_charset() fall back to "utf-8".
	 *
	 * @var array
	 */
	public $script_to_charset_unix = array(
		'west_european' => 'iso-8859-1',
		'estonian'      => 'iso-8859-1',
		'east_european' => 'iso-8859-2',
		'baltic'        => 'iso-8859-4',
		'cyrillic'      => 'iso-8859-5',
		'arabic'        => 'iso-8859-6',
		'greek'         => 'iso-8859-7',
		'hebrew'        => 'iso-8859-8',
		'turkish'       => 'iso-8859-9',
		'thai'          => 'iso-8859-11', // = TIS-620
		'lithuanian'    => 'iso-8859-13',
		'chinese'       => 'gb2312', // = euc-cn
		'japanese'      => 'euc-jp',
		'korean'        => 'euc-kr',
		'simpl_chinese' => 'gb2312',
		'trad_chinese'  => 'big5',
		'vietnamese'    => '',
		'unicode'       => 'utf-8',
		'albanian'      => 'utf-8'
	);
 380  
	/**
	 * Script (language family) name => charset, used by get_locale_charset()
	 * when TYPO3_OS is "WIN". An empty or missing entry makes
	 * get_locale_charset() fall back to "windows-1252".
	 *
	 * @var array
	 */
	public $script_to_charset_windows = array(
		'east_european' => 'windows-1250',
		'cyrillic'      => 'windows-1251',
		'west_european' => 'windows-1252',
		'greek'         => 'windows-1253',
		'turkish'       => 'windows-1254',
		'hebrew'        => 'windows-1255',
		'arabic'        => 'windows-1256',
		'baltic'        => 'windows-1257',
		'estonian'      => 'windows-1257',
		'lithuanian'    => 'windows-1257',
		'vietnamese'    => 'windows-1258',
		'thai'          => 'cp874',
		'korean'        => 'cp949',
		'chinese'       => 'gb2312',
		'japanese'      => 'shift_jis',
		'simpl_chinese' => 'gb2312',
		'trad_chinese'  => 'big5',
		'albanian'      => 'windows-1250',
		'unicode'       => 'utf-8'
	);
 403  
	/**
	 * Mapping of complete locale names to charsets.
	 *
	 * Keys must be all lowercase: get_locale_charset() lowercases the locale
	 * before looking it up here, so a key containing uppercase letters can
	 * never match (the former "sr@Latn" entry was unreachable for that reason).
	 *
	 * @var array
	 */
	public $locale_to_charset = array(
		'japanese.euc' => 'euc-jp',
		'ja_jp.ujis' => 'euc-jp',
		'korean.euc' => 'euc-kr',
		'sr@latn' => 'iso-8859-2',
		'zh_cn' => 'gb2312',
		'zh_hk' => 'big5',
		'zh_tw' => 'big5',
	);
 414  
	/**
	 * TYPO3 specific: the system charset used for each system language in TYPO3.
	 * An empty value means "iso-8859-1".
	 *
	 * @var array
	 */
	public $charSetArray = array(
		'af'    => '',
		'ar'    => 'iso-8859-6',
		'ba'    => 'iso-8859-2',
		'bg'    => 'windows-1251',
		'br'    => '',
		'ca'    => 'iso-8859-15',
		'ch'    => 'gb2312',
		'cs'    => 'windows-1250',
		'cz'    => 'windows-1250',
		'da'    => '',
		'de'    => '',
		'dk'    => '',
		'el'    => 'iso-8859-7',
		'eo'    => 'utf-8',
		'es'    => '',
		'et'    => 'iso-8859-4',
		'eu'    => '',
		'fa'    => 'utf-8',
		'fi'    => '',
		'fo'    => 'utf-8',
		'fr'    => '',
		'fr_CA' => '',
		'ga'    => '',
		'ge'    => 'utf-8',
		'gl'    => '',
		'gr'    => 'iso-8859-7',
		'he'    => 'utf-8',
		'hi'    => 'utf-8',
		'hk'    => 'big5',
		'hr'    => 'windows-1250',
		'hu'    => 'iso-8859-2',
		'is'    => 'utf-8',
		'it'    => '',
		'ja'    => 'shift_jis',
		'jp'    => 'shift_jis',
		'ka'    => 'utf-8',
		'kl'    => 'utf-8',
		'km'    => 'utf-8',
		'ko'    => 'euc-kr',
		'kr'    => 'euc-kr',
		'lt'    => 'windows-1257',
		'lv'    => 'utf-8',
		'ms'    => '',
		'my'    => '',
		'nl'    => '',
		'no'    => '',
		'pl'    => 'iso-8859-2',
		'pt'    => '',
		'pt_BR' => '',
		'qc'    => '',
		'ro'    => 'iso-8859-2',
		'ru'    => 'windows-1251',
		'se'    => '',
		'si'    => 'windows-1250',
		'sk'    => 'windows-1250',
		'sl'    => 'windows-1250',
		'sq'    => 'utf-8',
		'sr'    => 'utf-8',
		'sv'    => '',
		'th'    => 'iso-8859-11',
		'tr'    => 'iso-8859-9',
		'ua'    => 'windows-1251',
		'uk'    => 'windows-1251',
		'vi'    => 'utf-8',
		'vn'    => 'utf-8',
		'zh'    => 'big5',
	);
 485  
	/**
	 * TYPO3 specific: the iso name used for each system language in TYPO3.
	 * A missing key means the iso name is the same as the TYPO3 language key.
	 *
	 * @var array
	 * @deprecated since TYPO3 4.6, will be removed in TYPO3 6.0 - use t3lib_l10n_Locales::getIsoMapping()
	 */
	public $isoArray = array(
		'ba' => 'bs',
		'br' => 'pt_BR',
		'ch' => 'zh_CN',
		'cz' => 'cs',
		'dk' => 'da',
		'si' => 'sl',
		'se' => 'sv',
		'gl' => 'kl',
		'gr' => 'el',
		'hk' => 'zh_HK',
		'kr' => 'ko',
		'ua' => 'uk',
		'jp' => 'ja',
		'qc' => 'fr_CA',
		'vn' => 'vi',
		'ge' => 'ka',
		'ga' => 'gl',
	);
 508  
	/**
	 * Constructor: obtains a t3lib_l10n_Locales instance through
	 * t3lib_div::makeInstance() and keeps it in $this->locales.
	 */
	public function __construct() {
		$localesInstance = t3lib_div::makeInstance('t3lib_l10n_Locales');
		$this->locales = $localesInstance;
	}
 515  
	/**
	 * Normalizes a charset name: the name is lowercased and trimmed, and a
	 * known alias from $this->synonyms is replaced by the name it stands for.
	 *
	 * @param	string		Input charset
	 * @return	string		Normalized charset
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function parse_charset($charset) {
		$normalized = trim(strtolower($charset));

		return isset($this->synonyms[$normalized])
			? $this->synonyms[$normalized]
			: $normalized;
	}
 531  
	/**
	 * Get the charset of a locale.
	 *
	 * Accepted locale forms:
	 *
	 * ln             language
	 * ln_CN          language / country
	 * ln_CN.cs       language / country / charset
	 * ln_CN.cs@mod   language / country / charset / modifier
	 *
	 * Resolution order: exact match in $this->locale_to_charset, explicit
	 * charset part, the "euro" modifier (iso-8859-15), and finally the script
	 * of the language looked up in the Windows or Unix charset table
	 * (depending on TYPO3_OS). Unknown languages resolve to "windows-1252" on
	 * Windows and "utf-8" elsewhere.
	 *
	 * @param	string		Locale string
	 * @return	string		Charset resolved for locale string
	 * @author	Martin Kutschker <martin.t.kutschker@blackbox.net>
	 */
	function get_locale_charset($locale) {
		$locale = strtolower($locale);

			// exact locale specific charset?
		if (isset($this->locale_to_charset[$locale])) {
			return $this->locale_to_charset[$locale];
		}

			// get modifier; array_pad() supplies '' when the locale has no "@" part,
			// so list() never reads a missing offset
		list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

			// locale contains charset: use it
		list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
		if ($charset) {
			return $this->parse_charset($charset);
		}

			// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
		if ($modifier == 'euro') {
			return 'iso-8859-15';
		}

			// get language (the country part is not needed for the lookup)
		$localeParts = explode('_', $locale);
		$language = $localeParts[0];

			// unknown languages get an empty script name, which matches no table entry
		$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

		if (TYPO3_OS == 'WIN') {
			$scriptToCharset = $this->script_to_charset_windows;
			$defaultCharset = 'windows-1252';
		} else {
			$scriptToCharset = $this->script_to_charset_unix;
			$defaultCharset = 'utf-8';
		}

			// empty table values (e.g. 'vietnamese' on Unix) also fall back to the default
		return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $defaultCharset;
	}
 580  
 581  
 582  	 /********************************************
 583  	  *
 584  	  * Charset Conversion functions
 585  	  *
 586  	  ********************************************/
 587  
	/**
	 * Convert from one charset to another charset.
	 *
	 * When the target is UTF-8, or no entity fallback is requested, the
	 * conversion backend named in
	 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] ("mbstring",
	 * "iconv" or "recode") is tried first. If no backend is configured, its
	 * function is not available, or it fails, the conversion is done with
	 * utf8_encode() / utf8_decode() of this class.
	 *
	 * @param	string		Input string
	 * @param	string		From charset (the current charset of the string)
	 * @param	string		To charset (the output charset wanted)
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Converted string
	 * @see convArray()
	 */
	function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
		if ($fromCS == $toCS) {
			return $str;
		}

			// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
		if ($toCS == 'utf-8' || !$useEntityForNoChar) {
				// the setting may be absent; treat that as "no backend configured"
			$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
				? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
				: '';
			$conv_str = FALSE;

				// Each backend is only called when its function exists, so a stale
				// setting (e.g. "recode" without the extension) falls through to the
				// TYPO3 conversion instead of ending in a fatal error.
			switch ($convMethod) {
				case 'mbstring':
					if (function_exists('mb_convert_encoding')) {
						try {
							$conv_str = mb_convert_encoding($str, $toCS, $fromCS);
						} catch (ValueError $e) {
								// PHP 8 throws for unsupported charsets where older versions returned FALSE
							$conv_str = FALSE;
						}
					}
					break;

				case 'iconv':
					if (function_exists('iconv')) {
						$conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
					}
					break;

				case 'recode':
					if (function_exists('recode_string')) {
						$conv_str = recode_string($fromCS . '..' . $toCS, $str);
					}
					break;
			}

			if (FALSE !== $conv_str) {
				return $conv_str;
			}
				// fallback to TYPO3 conversion
		}

		if ($fromCS != 'utf-8') {
			$str = $this->utf8_encode($str, $fromCS);
		}
		if ($toCS != 'utf-8') {
			$str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
		}
		return $str;
	}
 638  
	/**
	 * Converts every string element of an array from one charset to another.
	 * Nested arrays are processed recursively; values that are neither strings
	 * nor arrays are left as they are.
	 * NOTICE: The array is passed by reference and modified in place!
	 *
	 * @param	array		Input array, possibly multidimensional
	 * @param	string		From charset (the current charset of the string)
	 * @param	string		To charset (the output charset wanted)
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	void
	 * @see conv()
	 */
	function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
		foreach ($array as $index => $unusedValue) {
			if (is_string($array[$index])) {
				$array[$index] = $this->conv($array[$index], $fromCS, $toCS, $useEntityForNoChar);
				continue;
			}

			if (is_array($array[$index])) {
				$this->convArray($array[$index], $fromCS, $toCS, $useEntityForNoChar);
			}
		}
	}
 659  
	/**
	 * Converts $str from $charset to UTF-8
	 *
	 * Bytes 0-127 are copied unchanged for single-byte and EUC-like charsets.
	 * Local character numbers without an entry in the parsed conversion table
	 * are replaced by chr($this->noCharByteVal).
	 *
	 * @param	string		String in local charset to convert to UTF-8
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @return	string		Output string, converted to UTF-8. NULL if the charset could not be initialized.
	 */
	function utf8_encode($str, $charset) {

		if ($charset === 'utf-8') {
			return $str;
		}

			// Parse conv. table if not already done; without a table nothing can be converted.
		if (!$this->initCharset($charset)) {
			return NULL;
		}

			// These lookups do not change while the string is traversed
		$isTwoByteSet = isset($this->twoByteSets[$charset]);
		$isEucBasedSet = isset($this->eucBasedSets[$charset]);
		$noChar = chr($this->noCharByteVal);

		$strLen = strlen($str);
		$outStr = '';

		for ($a = 0; $a < $strLen; $a++) { // Traverse each byte in string.
			$chr = substr($str, $a, 1);
			$ord = ord($chr);

			if ($isTwoByteSet) { // The charset has two bytes per char
					// substr() instead of $str[$a + 1]: a dangling last byte (odd string length)
					// yields 0 for the missing byte without an "Uninitialized string offset" error.
				$a++;
				$ord = $ord << 8 | ord(substr($str, $a, 1)); // assume big endian
			} elseif ($ord > 127) { // Byte above 127: not ASCII, needs a table lookup
					// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
					// Shift-JIS: bytes between 160 and 223 are single byte chars.
				if ($isEucBasedSet && ($charset != 'shift_jis' || $ord < 0xA0 || $ord > 0xDF)) {
					$a++;
					$ord = $ord * 256 + ord(substr($str, $a, 1));
				}
			} else { // Just ASCII 0-127 and one byte. Transparent
				$outStr .= $chr;
				continue;
			}

				// Use the UTF-8 sequence from the parsed conv. table if the local char-number
				// is known, otherwise the "no char" replacement byte.
			$outStr .= isset($this->parsedCharsets[$charset]['local'][$ord])
				? $this->parsedCharsets[$charset]['local'][$ord]
				: $noChar;
		}

		return $outStr;
	}
 712  
	/**
	 * Converts $str from UTF-8 to $charset
	 *
	 * ASCII bytes are copied unchanged. Each multibyte sequence is looked up in
	 * the parsed conversion table of the charset; an unknown sequence becomes a
	 * numeric entity (if requested) or chr($this->noCharByteVal). A byte above
	 * 127 that cannot start a sequence also becomes chr($this->noCharByteVal).
	 *
	 * @param	string		String in UTF-8 to convert to local charset
	 * @param	string		Charset, lowercase. Must be found in csconvtbl/ folder.
	 * @param	boolean		If set, then characters that are not available in the destination character set will be encoded as numeric entities
	 * @return	string		Output string, converted to local charset. NULL if the charset could not be initialized.
	 */
	function utf8_decode($str, $charset, $useEntityForNoChar = 0) {

		if ($charset === 'utf-8') {
			return $str;
		}

			// Parse conv. table if not already done
		if (!$this->initCharset($charset)) {
			return NULL;
		}

		$length = strlen($str);
		$result = '';

		for ($pos = 0; $pos < $length; $pos++) { // Traverse each byte in UTF-8 string.
			$byte = substr($str, $pos, 1);
			$code = ord($byte);

				// ASCII 0-127 is one byte and passes through unchanged
			if ($code <= 127) {
				$result .= $byte;
				continue;
			}

				// A first byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence.
			if (!($code & 64)) {
				$result .= chr($this->noCharByteVal);
				continue;
			}

				// Collect the sequence: every further leading 1-bit of the first byte
				// announces one more byte.
			$sequence = $byte;
			for ($bit = 0; $bit < 8; $bit++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$sequence .= substr($str, $pos, 1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
					// The local number; above 255 it is split into two bytes (16bit word assumed)
				$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
				$result .= $localNumber > 255
					? chr(($localNumber >> 8) & 255) . chr($localNumber & 255)
					: chr($localNumber);
			} elseif ($useEntityForNoChar) {
					// Create numeric entity
				$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
			} else {
					// No char exists
				$result .= chr($this->noCharByteVal);
			}
		}

		return $result;
	}
 771  
	/**
	 * Converts all chars > 127 to numeric entities.
	 *
	 * ASCII bytes are copied unchanged. Each multibyte sequence is turned into
	 * "&#...;" using utf8CharToUnumber(); a byte above 127 that cannot start a
	 * sequence is replaced by chr($this->noCharByteVal).
	 *
	 * @param	string		Input string
	 * @return	string		Output string
	 */
	function utf8_to_entities($str) {
		$length = strlen($str);
		$result = '';

		for ($pos = 0; $pos < $length; $pos++) { // Traverse each byte in UTF-8 string.
			$byte = substr($str, $pos, 1);
			$code = ord($byte);

				// ASCII 0-127 is one byte and passes through unchanged
			if ($code <= 127) {
				$result .= $byte;
				continue;
			}

				// A first byte must have the 7th bit set; otherwise we are in the MIDDLE of a sequence.
			if (!($code & 64)) {
				$result .= chr($this->noCharByteVal);
				continue;
			}

				// Collect the sequence: every further leading 1-bit of the first byte
				// announces one more byte.
			$sequence = $byte;
			for ($bit = 0; $bit < 8; $bit++) {
				$code = $code << 1;
				if (!($code & 128)) {
					break;
				}
				$pos++;
				$sequence .= substr($str, $pos, 1);
			}

			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		}

		return $result;
	}
 809  
	/**
	 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
	 *
	 * Anything that looks like an entity but cannot be converted (malformed numeric entity,
	 * unknown name, or a named entity while $alsoStdHtmlEnt is off) is left in the string unchanged.
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
	 * @return	string		Output string
	 * @see UnumberToChar()
	 */
	function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
			// Workaround for #39287: 3rd parameter for get_html_translation_table() was only added in PHP 5.3.4 and later
			// see http://php.net/manual/en/function.get-html-translation-table.php
		$applyPhpCompatibilityFix = version_compare(phpversion(), '5.3.4', '<');

		$trans_tbl = array();
		if ($alsoStdHtmlEnt) {
			if ($applyPhpCompatibilityFix === TRUE) {
				$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
			} else {
				$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
			}
		}

			// Wrap the inner part of every entity candidate in a random token, then split on that token:
			// the odd-indexed parts are the entity names without "&" and ";".
			// The group reference must be written "${2}": the token is a hex string and may start with a
			// digit, which would otherwise be read as part of the group number.
		$token = md5(microtime());
		$parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));
		foreach ($parts as $k => $v) {
				// only take every second element
			if ($k % 2 === 0) {
				continue;
			}

			$match = array();
			if (substr($v, 0, 1) == '#') { // Dec or hex entities:
				if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $match)) {
					$parts[$k] = $this->UnumberToChar(hexdec($match[1]));
				} elseif (preg_match('/^#([0-9]+)$/', $v, $match)) {
					$parts[$k] = $this->UnumberToChar($match[1]);
				} else { // Malformed numeric entity (eg. "&#;" or "&#zz;"): keep as it was
					$parts[$k] = '&' . $v . ';';
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
				$v = $trans_tbl['&' . $v . ';'];
				if ($applyPhpCompatibilityFix === TRUE) {
						// Without the charset parameter the table holds ISO-8859-1 characters.
					$v = $this->utf8_encode($v, 'iso-8859-1');
				}
				$parts[$k] = $v;
			} else { // No conversion:
				$parts[$k] = '&' . $v . ';';
			}
		}

		return implode('', $parts);
	}
 860  
	/**
	 * Splits a UTF-8 string into its characters and returns them as an array, either as
	 * UNICODE integer numbers or as the UTF-8 characters themselves.
	 *
	 * A byte above 127 that cannot start a sequence (MIDDLE of a multi-byte sequence) is
	 * reported as $this->noCharByteVal (or its chr() when characters are requested).
	 *
	 * @param	string		Input string, UTF-8
	 * @param	boolean		If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
	 * @param	boolean		If set, then instead of integer numbers the real UTF-8 char is returned.
	 * @return	array		Output array with the char numbers
	 * @see entities_to_utf8(), utf8CharToUnumber()
	 */
	function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
			// Resolve entities first so that they count as single characters.
		if ($convEntities) {
			$str = $this->entities_to_utf8($str, 1);
		}

		$byteCount = strlen($str);
		$collected = array();
		$pos = 0;

		while ($pos < $byteCount) {
			$byte = substr($str, $pos, 1);
			$code = ord($byte);

			if ($code <= 127) {
					// ASCII 0-127: exactly one byte.
				$collected[] = $retChar ? chr($code) : $code;
			} elseif (!($code & 64)) {
					// High bit set but 7th bit clear: not a valid first byte of a sequence.
				$collected[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			} else {
					// First byte of a sequence: each further leading 1-bit announces one more byte.
				$sequence = $byte;
				for ($bit = 0; $bit < 8; $bit++) {
					$code = $code << 1;
					if (!($code & 128)) {
						break;
					}
					$pos++;
					$sequence .= substr($str, $pos, 1);
				}

				$collected[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
			}

			$pos++;
		}

		return $collected;
	}
 905  
	/**
	 * Encodes a UNICODE number as a UTF-8 multibyte character.
	 * Algorithm based on script found at From: http://czyborra.com/utf/
	 *
	 * The bits of the number are spread across the bytes; the number of high bits set in the
	 * lead byte announces the number of bytes in the multibyte sequence:
	 *
	 *  bytes | bits | representation
	 *      1 |    7 | 0vvvvvvv
	 *      2 |   11 | 110vvvvv 10vvvvvv
	 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
	 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
	 *
	 * Numbers of 0x80000000 and above cannot be expressed; chr($this->noCharByteVal) is returned for them.
	 *
	 * @param	integer		UNICODE integer
	 * @return	string		UTF-8 multibyte character string
	 * @see utf8CharToUnumber()
	 */
	function UnumberToChar($cbyte) {
			// One byte: plain ASCII.
		if ($cbyte < 0x80) {
			return chr($cbyte);
		}

			// Each entry: exclusive upper limit, marker bits of the lead byte, number of continuation bytes.
		$layouts = array(
			array(0x800, 0xC0, 1),
			array(0x10000, 0xE0, 2),
			array(0x200000, 0xF0, 3),
			array(0x4000000, 0xF8, 4),
			array(0x80000000, 0xFC, 5),
		);

		foreach ($layouts as $layout) {
			list($limit, $leadMarker, $trailCount) = $layout;
			if ($cbyte < $limit) {
					// Lead byte carries the topmost bits, every continuation byte the next six.
				$encoded = chr($leadMarker | ($cbyte >> (6 * $trailCount)));
				for ($slot = $trailCount - 1; $slot >= 0; $slot--) {
					$encoded .= chr(0x80 | (($cbyte >> (6 * $slot)) & 0x3F));
				}
				return $encoded;
			}
		}

			// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}
 970  
	/**
	 * Decodes a single UTF-8 multibyte character into its UNICODE number.
	 *
	 * If the first byte is not the lead byte of a multi-byte sequence, the value of that
	 * byte is returned as it is.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	boolean		If set, then a hex. number is returned (as a string prefixed with "x").
	 * @return	integer		UNICODE integer
	 * @see UnumberToChar()
	 */
	function utf8CharToUnumber($str, $hex = 0) {
		$leadByte = ord(substr($str, 0, 1));
		$codePoint = $leadByte;

		if (($leadByte & 192) == 192) { // Both top bits set: this is the lead byte of a multi-byte sequence
			$trailingBits = '';
			$shifted = $leadByte;
			for ($index = 0; $index < 8; $index++) {
				$shifted = $shifted << 1;
				if (!($shifted & 128)) {
					break;
				}
					// Every further leading 1-bit stands for one continuation byte: take its lower 6 bits.
				$trailingBits .= substr('00000000' . decbin(ord(substr($str, $index + 1, 1))), -6);
			}

				// $index continuation bytes were read, so the lead byte contributes its lowest (6 - $index) bits.
			$leadBits = substr('00000000' . decbin($leadByte), -(6 - $index));
			$codePoint = bindec($leadBits . $trailingBits);
		}

		return $hex ? 'x' . dechex($codePoint) : $codePoint;
	}
1002  
1003  
1004  	 /********************************************
1005  	  *
1006  	  * Init functions
1007  	  *
1008  	  ********************************************/
1009  
1010  	 /**
1011  	  * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
1012  	  * This function is automatically called by the conversion functions
1013  	  *
1014  	  * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
1015  	  *
1016  	  * @param	 string	 	 The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
1017  	  * @return	 integer	 	 Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
1018  	  * @access private
1019  	  */
1020  	function initCharset($charset) {
1021  	 	 	 // Only process if the charset is not yet loaded:
1022  	 	 if (!is_array($this->parsedCharsets[$charset])) {
1023  
1024  	 	 	 	 // Conversion table filename:
1025  	 	 	 $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
1026  
1027  	 	 	 	 // If the conversion table is found:
1028  	 	 	 if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) {
1029  	 	 	 	 	 // Cache file for charsets:
1030  	 	 	 	 	 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
1031  	 	 	 	 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
1032  	 	 	 	 if ($cacheFile && @is_file($cacheFile)) {
1033  	 	 	 	 	 $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1034  	 	 	 	 } else {
1035  	 	 	 	 	 	 // Parse conversion table into lines:
1036  	 	 	 	 	 $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
1037  	 	 	 	 	 	 // Initialize the internal variable holding the conv. table:
1038  	 	 	 	 	 $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
1039  	 	 	 	 	 	 // traverse the lines:
1040  	 	 	 	 	 $detectedType = '';
1041  	 	 	 	 	 foreach ($lines as $value) {
1042  	 	 	 	 	 	 if (trim($value) && substr($value, 0, 1) != '#') { // Comment line or blanks are ignored.
1043  
1044  	 	 	 	 	 	 	 	 // Detect type if not done yet: (Done on first real line)
1045  	 	 	 	 	 	 	 	 // The "whitespaced" type is on the syntax 	 "0x0A	 0x000A	 #LINE FEED" 	 while 	 "ms-token" is like 	 	 "B9 = U+00B9 : SUPERSCRIPT ONE"
1046  	 	 	 	 	 	 	 if (!$detectedType) {
1047  	 	 	 	 	 	 	 	 $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ? 'whitespaced' : 'ms-token';
1048  	 	 	 	 	 	 	 }
1049  
1050  	 	 	 	 	 	 	 if ($detectedType == 'ms-token') {
1051  	 	 	 	 	 	 	 	 list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
1052  	 	 	 	 	 	 	 } elseif ($detectedType == 'whitespaced') {
1053  	 	 	 	 	 	 	 	 $regA = array();
1054  	 	 	 	 	 	 	 	 preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA);
1055  	 	 	 	 	 	 	 	 $hexbyte = $regA[1];
1056  	 	 	 	 	 	 	 	 $utf8 = 'U+' . $regA[2];
1057  	 	 	 	 	 	 	 }
1058  	 	 	 	 	 	 	 $decval = hexdec(trim($hexbyte));
1059  	 	 	 	 	 	 	 if ($decval > 127) {
1060  	 	 	 	 	 	 	 	 $utf8decval = hexdec(substr(trim($utf8), 2));
1061  	 	 	 	 	 	 	 	 $this->parsedCharsets[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
1062  	 	 	 	 	 	 	 	 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval;
1063  	 	 	 	 	 	 	 }
1064  	 	 	 	 	 	 }
1065  	 	 	 	 	 }
1066  	 	 	 	 	 if ($cacheFile) {
1067  	 	 	 	 	 	 t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
1068  	 	 	 	 	 }
1069  	 	 	 	 }
1070  	 	 	 	 return 2;
1071  	 	 	 } else {
1072  	 	 	 	 return FALSE;
1073  	 	 	 }
1074  	 	 } else {
1075  	 	 	 return 1;
1076  	 	 }
1077  	 }
1078  
1079  	 /**
1080  	  * This function initializes all UTF-8 character data tables.
1081  	  *
1082  	  * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1083  	  *
1084  	  * @param	 string	 	 Mode ("case", "ascii", ...)
1085  	  * @return	 integer	 	 Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1086  	  * @access private
1087  	  */
1088  	function initUnicodeData($mode = NULL) {
1089  	 	 	 // cache files
1090  	 	 $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1091  	 	 $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1092  
1093  	 	 	 // Only process if the tables are not yet loaded
1094  	 	 switch ($mode) {
1095  	 	 	 case 'case':
1096  	 	 	 	 if (is_array($this->caseFolding['utf-8'])) {
1097  	 	 	 	 	 return 1;
1098  	 	 	 	 }
1099  
1100  	 	 	 	 	 // Use cached version if possible
1101  	 	 	 	 if ($cacheFileCase && @is_file($cacheFileCase)) {
1102  	 	 	 	 	 $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
1103  	 	 	 	 	 return 2;
1104  	 	 	 	 }
1105  	 	 	 	 break;
1106  
1107  	 	 	 case 'ascii':
1108  	 	 	 	 if (is_array($this->toASCII['utf-8'])) {
1109  	 	 	 	 	 return 1;
1110  	 	 	 	 }
1111  
1112  	 	 	 	 	 // Use cached version if possible
1113  	 	 	 	 if ($cacheFileASCII && @is_file($cacheFileASCII)) {
1114  	 	 	 	 	 $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
1115  	 	 	 	 	 return 2;
1116  	 	 	 	 }
1117  	 	 	 	 break;
1118  	 	 }
1119  
1120  	 	 	 // process main Unicode data file
1121  	 	 $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
1122  	 	 if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
1123  	 	 	 return FALSE;
1124  	 	 }
1125  
1126  	 	 $fh = fopen($unicodeDataFile, 'rb');
1127  	 	 if (!$fh) {
1128  	 	 	 return FALSE;
1129  	 	 }
1130  
1131  	 	 	 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1132  	 	 	 // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1133  	 	 $this->caseFolding['utf-8'] = array();
1134  	 	 $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
1135  	 	 $utf8CaseFolding['toUpper'] = array();
1136  	 	 $utf8CaseFolding['toLower'] = array();
1137  	 	 $utf8CaseFolding['toTitle'] = array();
1138  
1139  	 	 $decomposition = array(); // array of temp. decompositions
1140  	 	 $mark = array(); // array of chars that are marks (eg. composing accents)
1141  	 	 $number = array(); // array of chars that are numbers (eg. digits)
1142  	 	 $omit = array(); // array of chars to be omitted (eg. Russian hard sign)
1143  
1144  	 	 while (!feof($fh)) {
1145  	 	 	 $line = fgets($fh, 4096);
1146  	 	 	 	 // has a lot of info
1147  	 	 	 list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));
1148  
1149  	 	 	 $ord = hexdec($char);
1150  	 	 	 if ($ord > 0xFFFF) {
1151  	 	 	 	 break;
1152  	 	 	 } // only process the BMP
1153  
1154  	 	 	 $utf8_char = $this->UnumberToChar($ord);
1155  
1156  	 	 	 if ($upper) {
1157  	 	 	 	 $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1158  	 	 	 }
1159  	 	 	 if ($lower) {
1160  	 	 	 	 $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1161  	 	 	 }
1162  	 	 	 	 // store "title" only when different from "upper" (only a few)
1163  	 	 	 if ($title && $title != $upper) {
1164  	 	 	 	 $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1165  	 	 	 }
1166  
1167  	 	 	 switch ($cat[0]) {
1168  	 	 	 	 case 'M': // mark (accent, umlaut, ...)
1169  	 	 	 	 	 $mark["U+$char"] = 1;
1170  	 	 	 	 	 break;
1171  
1172  	 	 	 	 case 'N': // numeric value
1173  	 	 	 	 	 if ($ord > 0x80 && $num != '') {
1174  	 	 	 	 	 	 $number["U+$char"] = $num;
1175  	 	 	 	 	 }
1176  	 	 	 }
1177  
1178  	 	 	 	 // accented Latin letters without "official" decomposition
1179  	 	 	 $match = array();
1180  	 	 	 if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
1181  	 	 	 	 $c = ord($match[2]);
1182  	 	 	 	 if ($match[1] == 'SMALL') {
1183  	 	 	 	 	 $c += 32;
1184  	 	 	 	 }
1185  
1186  	 	 	 	 $decomposition["U+$char"] = array(dechex($c));
1187  	 	 	 	 continue;
1188  	 	 	 }
1189  
1190  	 	 	 $match = array();
1191  	 	 	 if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
1192  	 	 	 	 switch ($match[1]) {
1193  	 	 	 	 	 case '<circle>': // add parenthesis as circle replacement, eg (1)
1194  	 	 	 	 	 	 $match[2] = '0028 ' . $match[2] . ' 0029';
1195  	 	 	 	 	 	 break;
1196  
1197  	 	 	 	 	 case '<square>': // add square brackets as square replacement, eg [1]
1198  	 	 	 	 	 	 $match[2] = '005B ' . $match[2] . ' 005D';
1199  	 	 	 	 	 	 break;
1200  
1201  	 	 	 	 	 case '<compat>': // ignore multi char decompositions that start with a space
1202  	 	 	 	 	 	 if (preg_match('/^0020 /', $match[2])) {
1203  	 	 	 	 	 	 	 continue 2;
1204  	 	 	 	 	 	 }
1205  	 	 	 	 	 	 break;
1206  
1207  	 	 	 	 	 	 // ignore Arabic and vertical layout presentation decomposition
1208  	 	 	 	 	 case '<initial>':
1209  	 	 	 	 	 case '<medial>':
1210  	 	 	 	 	 case '<final>':
1211  	 	 	 	 	 case '<isolated>':
1212  	 	 	 	 	 case '<vertical>':
1213  	 	 	 	 	 	 continue 2;
1214  	 	 	 	 }
1215  	 	 	 	 $decomposition["U+$char"] = explode(' ', $match[2]);
1216  	 	 	 }
1217  	 	 }
1218  	 	 fclose($fh);
1219  
1220  	 	 	 // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1221  	 	 $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
1222  	 	 if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
1223  	 	 	 $fh = fopen($specialCasingFile, 'rb');
1224  	 	 	 if ($fh) {
1225  	 	 	 	 while (!feof($fh)) {
1226  	 	 	 	 	 $line = fgets($fh, 4096);
1227  	 	 	 	 	 if ($line[0] != '#' && trim($line) != '') {
1228  
1229  	 	 	 	 	 	 list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
1230  	 	 	 	 	 	 if ($cond == '' || $cond[0] == '#') {
1231  	 	 	 	 	 	 	 $utf8_char = $this->UnumberToChar(hexdec($char));
1232  	 	 	 	 	 	 	 if ($char != $lower) {
1233  	 	 	 	 	 	 	 	 $arr = explode(' ', $lower);
1234  	 	 	 	 	 	 	 	 for ($i = 0; isset($arr[$i]); $i++) {
1235  	 	 	 	 	 	 	 	 	 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1236  	 	 	 	 	 	 	 	 }
1237  	 	 	 	 	 	 	 	 $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
1238  	 	 	 	 	 	 	 }
1239  	 	 	 	 	 	 	 if ($char != $title && $title != $upper) {
1240  	 	 	 	 	 	 	 	 $arr = explode(' ', $title);
1241  	 	 	 	 	 	 	 	 for ($i = 0; isset($arr[$i]); $i++) {
1242  	 	 	 	 	 	 	 	 	 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1243  	 	 	 	 	 	 	 	 }
1244  	 	 	 	 	 	 	 	 $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
1245  	 	 	 	 	 	 	 }
1246  	 	 	 	 	 	 	 if ($char != $upper) {
1247  	 	 	 	 	 	 	 	 $arr = explode(' ', $upper);
1248  	 	 	 	 	 	 	 	 for ($i = 0; isset($arr[$i]); $i++) {
1249  	 	 	 	 	 	 	 	 	 $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1250  	 	 	 	 	 	 	 	 }
1251  	 	 	 	 	 	 	 	 $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
1252  	 	 	 	 	 	 	 }
1253  	 	 	 	 	 	 }
1254  	 	 	 	 	 }
1255  	 	 	 	 }
1256  	 	 	 	 fclose($fh);
1257  	 	 	 }
1258  	 	 }
1259  
1260  	 	 	 // process custom decompositions
1261  	 	 $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
1262  	 	 if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
1263  	 	 	 $fh = fopen($customTranslitFile, 'rb');
1264  	 	 	 if ($fh) {
1265  	 	 	 	 while (!feof($fh)) {
1266  	 	 	 	 	 $line = fgets($fh, 4096);
1267  	 	 	 	 	 if ($line[0] != '#' && trim($line) != '') {
1268  	 	 	 	 	 	 list($char, $translit) = t3lib_div::trimExplode(';', $line);
1269  	 	 	 	 	 	 if (!$translit) {
1270  	 	 	 	 	 	 	 $omit["U+$char"] = 1;
1271  	 	 	 	 	 	 }
1272  	 	 	 	 	 	 $decomposition["U+$char"] = explode(' ', $translit);
1273  
1274  	 	 	 	 	 }
1275  	 	 	 	 }
1276  	 	 	 	 fclose($fh);
1277  	 	 	 }
1278  	 	 }
1279  
1280  	 	 	 // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1281  	 	 foreach ($decomposition as $from => $to) {
1282  	 	 	 $code_decomp = array();
1283  
1284  	 	 	 while ($code_value = array_shift($to)) {
1285  	 	 	 	 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
1286  	 	 	 	 	 foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
1287  	 	 	 	 	 	 array_unshift($to, $cv);
1288  	 	 	 	 	 }
1289  	 	 	 	 } elseif (!isset($mark["U+$code_value"])) { // remove mark
1290  	 	 	 	 	 array_push($code_decomp, $code_value);
1291  	 	 	 	 }
1292  	 	 	 }
1293  	 	 	 if (count($code_decomp) || isset($omit[$from])) {
1294  	 	 	 	 $decomposition[$from] = $code_decomp;
1295  	 	 	 } else {
1296  	 	 	 	 unset($decomposition[$from]);
1297  	 	 	 }
1298  	 	 }
1299  
1300  	 	 	 // create ascii only mapping
1301  	 	 $this->toASCII['utf-8'] = array();
1302  	 	 $ascii =& $this->toASCII['utf-8'];
1303  
1304  	 	 foreach ($decomposition as $from => $to) {
1305  	 	 	 $code_decomp = array();
1306  	 	 	 while ($code_value = array_shift($to)) {
1307  	 	 	 	 $ord = hexdec($code_value);
1308  	 	 	 	 if ($ord > 127) {
1309  	 	 	 	 	 continue 2;
1310  	 	 	 	 } // skip decompositions containing non-ASCII chars
1311  	 	 	 	 else
1312  	 	 	 	 {
1313  	 	 	 	 	 array_push($code_decomp, chr($ord));
1314  	 	 	 	 }
1315  	 	 	 }
1316  	 	 	 $ascii[$this->UnumberToChar(hexdec(str_replace('U+', '0x', $from)))] = join('', $code_decomp);
1317  	 	 }
1318  
1319  	 	 	 // add numeric decompositions
1320  	 	 foreach ($number as $from => $to) {
1321  	 	 	 $utf8_char = $this->UnumberToChar(hexdec(str_replace('U+', '0x', $from)));
1322  	 	 	 if (!isset($ascii[$utf8_char])) {
1323  	 	 	 	 $ascii[$utf8_char] = $to;
1324  	 	 	 }
1325  	 	 }
1326  
1327  	 	 if ($cacheFileCase) {
1328  	 	 	 t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1329  	 	 }
1330  
1331  	 	 if ($cacheFileASCII) {
1332  	 	 	 t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1333  	 	 }
1334  
1335  	 	 return 3;
1336  	 }
1337  
1338  	 /**
1339  	  * This function initializes the folding table for a charset other than UTF-8.
1340  	  * This function is automatically called by the case folding functions.
1341  	  *
1342  	  * @param	 string	 	 Charset for which to initialize case folding.
1343  	  * @return	 integer	 	 Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1344  	  * @access private
1345  	  */
1346  	function initCaseFolding($charset) {
1347  	 	 	 // Only process if the case table is not yet loaded:
1348  	 	 if (is_array($this->caseFolding[$charset])) {
1349  	 	 	 return 1;
1350  	 	 }
1351  
1352  	 	 	 // Use cached version if possible
1353  	 	 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
1354  	 	 if ($cacheFile && @is_file($cacheFile)) {
1355  	 	 	 $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1356  	 	 	 return 2;
1357  	 	 }
1358  
1359  	 	 	 // init UTF-8 conversion for this charset
1360  	 	 if (!$this->initCharset($charset)) {
1361  	 	 	 return FALSE;
1362  	 	 }
1363  
1364  	 	 	 // UTF-8 case folding is used as the base conversion table
1365  	 	 if (!$this->initUnicodeData('case')) {
1366  	 	 	 return FALSE;
1367  	 	 }
1368  
1369  	 	 $nochar = chr($this->noCharByteVal);
1370  	 	 foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
1371  	 	 	 	 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1372  	 	 	 $c = $this->utf8_decode($utf8, $charset);
1373  
1374  	 	 	 	 // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
1375  	 	 	 $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
1376  	 	 	 if ($cc != '' && $cc != $nochar) {
1377  	 	 	 	 $this->caseFolding[$charset]['toUpper'][$c] = $cc;
1378  	 	 	 }
1379  
1380  	 	 	 	 // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
1381  	 	 	 $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
1382  	 	 	 if ($cc != '' && $cc != $nochar) {
1383  	 	 	 	 $this->caseFolding[$charset]['toLower'][$c] = $cc;
1384  	 	 	 }
1385  
1386  	 	 	 	 // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
1387  	 	 	 $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
1388  	 	 	 if ($cc != '' && $cc != $nochar) {
1389  	 	 	 	 $this->caseFolding[$charset]['toTitle'][$c] = $cc;
1390  	 	 	 }
1391  	 	 }
1392  
1393  	 	 	 // add the ASCII case table
1394  	 	 for ($i = ord('a'); $i <= ord('z'); $i++) {
1395  	 	 	 $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
1396  	 	 }
1397  	 	 for ($i = ord('A'); $i <= ord('Z'); $i++) {
1398  	 	 	 $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
1399  	 	 }
1400  
1401  	 	 if ($cacheFile) {
1402  	 	 	 t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
1403  	 	 }
1404  
1405  	 	 return 3;
1406  	 }
1407  
1408  	 /**
1409  	  * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
1410  	  * This function is automatically called by the ASCII transliteration functions.
1411  	  *
1412  	  * @param	 string	 	 Charset for which to initialize conversion.
1413  	  * @return	 integer	 	 Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1414  	  * @access private
1415  	  */
1416  	function initToASCII($charset) {
1417  	 	 	 // Only process if the case table is not yet loaded:
1418  	 	 if (is_array($this->toASCII[$charset])) {
1419  	 	 	 return 1;
1420  	 	 }
1421  
1422  	 	 	 // Use cached version if possible
1423  	 	 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
1424  	 	 if ($cacheFile && @is_file($cacheFile)) {
1425  	 	 	 $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1426  	 	 	 return 2;
1427  	 	 }
1428  
1429  	 	 	 // init UTF-8 conversion for this charset
1430  	 	 if (!$this->initCharset($charset)) {
1431  	 	 	 return FALSE;
1432  	 	 }
1433  
1434  	 	 	 // UTF-8/ASCII transliteration is used as the base conversion table
1435  	 	 if (!$this->initUnicodeData('ascii')) {
1436  	 	 	 return FALSE;
1437  	 	 }
1438  
1439  	 	 $nochar = chr($this->noCharByteVal);
1440  	 	 foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8) {
1441  	 	 	 	 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
1442  	 	 	 $c = $this->utf8_decode($utf8, $charset);
1443  
1444  	 	 	 if (isset($this->toASCII['utf-8'][$utf8])) {
1445  	 	 	 	 $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
1446  	 	 	 }
1447  	 	 }
1448  
1449  	 	 if ($cacheFile) {
1450  	 	 	 t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
1451  	 	 }
1452  
1453  	 	 return 3;
1454  	 }
1455  
1456  
1457  	 /********************************************
1458  	  *
1459  	  * String operation functions
1460  	  *
1461  	  ********************************************/
1462  
1463  	 /**
1464  	  * Returns a part of a string.
1465  	  * Unit-tested by Kasper (single byte charsets only)
1466  	  *
1467  	  * @param	 string	 	 The character set
1468  	  * @param	 string	 	 Character string
1469  	  * @param	 integer	 	 Start position (character position)
1470  	  * @param	 integer	 	 Length (in characters)
1471  	  * @return	 string	 	 The substring
1472  	  * @see substr(), mb_substr()
1473  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1474  	  */
1475  	function substr($charset, $string, $start, $len = NULL) {
1476  	 	 if ($len === 0 || $string === '') {
1477  	 	 	 return '';
1478  	 	 }
1479  
1480  	 	 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1481  	 	 	 	 // cannot omit $len, when specifying charset
1482  	 	 	 if ($len == NULL) {
1483  	 	 	 	 $enc = mb_internal_encoding(); // save internal encoding
1484  	 	 	 	 mb_internal_encoding($charset);
1485  	 	 	 	 $str = mb_substr($string, $start);
1486  	 	 	 	 mb_internal_encoding($enc); // restore internal encoding
1487  
1488  	 	 	 	 return $str;
1489  	 	 	 }
1490  	 	 	 else {
1491  	 	 	 	 return mb_substr($string, $start, $len, $charset);
1492  	 	 	 }
1493  	 	 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1494  	 	 	 	 // cannot omit $len, when specifying charset
1495  	 	 	 if ($len == NULL) {
1496  	 	 	 	 $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
1497  	 	 	 	 iconv_set_encoding('internal_encoding', $charset);
1498  	 	 	 	 $str = iconv_substr($string, $start);
1499  	 	 	 	 iconv_set_encoding('internal_encoding', $enc); // restore internal encoding
1500  
1501  	 	 	 	 return $str;
1502  	 	 	 }
1503  	 	 	 else {
1504  	 	 	 	 return iconv_substr($string, $start, $len, $charset);
1505  	 	 	 }
1506  	 	 } elseif ($charset == 'utf-8') {
1507  	 	 	 return $this->utf8_substr($string, $start, $len);
1508  	 	 } elseif ($this->eucBasedSets[$charset]) {
1509  	 	 	 return $this->euc_substr($string, $start, $charset, $len);
1510  	 	 } elseif ($this->twoByteSets[$charset]) {
1511  	 	 	 return substr($string, $start * 2, $len * 2);
1512  	 	 } elseif ($this->fourByteSets[$charset]) {
1513  	 	 	 return substr($string, $start * 4, $len * 4);
1514  	 	 }
1515  
1516  	 	 	 // treat everything else as single-byte encoding
1517  	 	 return $len === NULL ? substr($string, $start) : substr($string, $start, $len);
1518  	 }
1519  
1520  	 /**
1521  	  * Counts the number of characters.
1522  	  * Unit-tested by Kasper (single byte charsets only)
1523  	  *
1524  	  * @param	 string	 	 The character set
1525  	  * @param	 string	 	 Character string
1526  	  * @return	 integer	 	 The number of characters
1527  	  * @see strlen()
1528  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1529  	  */
1530  	function strlen($charset, $string) {
1531  	 	 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1532  	 	 	 return mb_strlen($string, $charset);
1533  	 	 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1534  	 	 	 return iconv_strlen($string, $charset);
1535  	 	 } elseif ($charset == 'utf-8') {
1536  	 	 	 return $this->utf8_strlen($string);
1537  	 	 } elseif ($this->eucBasedSets[$charset]) {
1538  	 	 	 return $this->euc_strlen($string, $charset);
1539  	 	 } elseif ($this->twoByteSets[$charset]) {
1540  	 	 	 return strlen($string) / 2;
1541  	 	 } elseif ($this->fourByteSets[$charset]) {
1542  	 	 	 return strlen($string) / 4;
1543  	 	 }
1544  	 	 	 // treat everything else as single-byte encoding
1545  	 	 return strlen($string);
1546  	 }
1547  
1548  	 /**
1549  	  * Method to crop strings using the mb_substr function.
1550  	  *
1551  	  * @param  string	 	 The character set
1552  	  * @param  string	 	 String to be cropped
1553  	  * @param  integer	 	 Crop length (in characters)
1554  	  * @param  string	 	 Crop signifier
1555  	  * @return string	 	 The shortened string
1556  	  * @see mb_strlen(), mb_substr()
1557  	  */
1558  	protected function cropMbstring($charset, $string, $len, $crop = '') {
1559  	 	 if (intval($len) === 0 || mb_strlen($string, $charset) <= abs($len)) {
1560  	 	 	 return $string;
1561  	 	 }
1562  
1563  	 	 if ($len > 0) {
1564  	 	 	 $string = mb_substr($string, 0, $len, $charset) . $crop;
1565  	 	 } else {
1566  	 	 	 $string = $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
1567  	 	 }
1568  
1569  	 	 return $string;
1570  	 }
1571  
1572  	 /**
1573  	  * Truncates a string and pre-/appends a string.
1574  	  * Unit tested by Kasper
1575  	  *
1576  	  * @param	 string	 	 The character set
1577  	  * @param	 string	 	 Character string
1578  	  * @param	 integer	 	 Length (in characters)
1579  	  * @param	 string	 	 Crop signifier
1580  	  * @return	 string	 	 The shortened string
1581  	  * @see substr(), mb_strimwidth()
1582  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1583  	  */
1584  	function crop($charset, $string, $len, $crop = '') {
1585  	 	 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1586  	 	 	 return $this->cropMbstring($charset, $string, $len, $crop);
1587  	 	 }
1588  
1589  	 	 if (intval($len) == 0) {
1590  	 	 	 return $string;
1591  	 	 }
1592  
1593  	 	 if ($charset == 'utf-8') {
1594  	 	 	 $i = $this->utf8_char2byte_pos($string, $len);
1595  	 	 } elseif ($this->eucBasedSets[$charset]) {
1596  	 	 	 $i = $this->euc_char2byte_pos($string, $len, $charset);
1597  	 	 } else {
1598  	 	 	 if ($len > 0) {
1599  	 	 	 	 $i = $len;
1600  	 	 	 } else {
1601  	 	 	 	 $i = strlen($string) + $len;
1602  	 	 	 	 if ($i <= 0) {
1603  	 	 	 	 	 $i = FALSE;
1604  	 	 	 	 }
1605  	 	 	 }
1606  	 	 }
1607  
1608  	 	 if ($i === FALSE) { // $len outside actual string length
1609  	 	 	 return $string;
1610  	 	 } else {
1611  	 	 	 if ($len > 0) {
1612  	 	 	 	 if (strlen($string[$i])) {
1613  	 	 	 	 	 return substr($string, 0, $i) . $crop;
1614  
1615  	 	 	 	 }
1616  	 	 	 } else {
1617  	 	 	 	 if (strlen($string[$i - 1])) {
1618  	 	 	 	 	 return $crop . substr($string, $i);
1619  	 	 	 	 }
1620  	 	 	 }
1621  
1622  	 	 	 /*
1623  	 	 	    if (abs($len)<$this->strlen($charset,$string))	 {	 // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return TRUE here (which is not a catastrophe, but...)
1624  	 	 	 	    if ($len > 0)	 {
1625  	 	 	 	 	    return substr($string,0,$i).$crop;
1626  	 	 	 	    } else {
1627  	 	 	 	 	    return $crop.substr($string,$i);
1628  	 	 	 	    }
1629  	 	 	    }
1630     */
1631  	 	 }
1632  	 	 return $string;
1633  	 }
1634  
1635  	 /**
1636  	  * Cuts a string short at a given byte length.
1637  	  *
1638  	  * @param	 string	 	 The character set
1639  	  * @param	 string	 	 Character string
1640  	  * @param	 integer	 	 The byte length
1641  	  * @return	 string	 	 The shortened string
1642  	  * @see mb_strcut()
1643  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1644  	  */
1645  	function strtrunc($charset, $string, $len) {
1646  	 	 if ($len <= 0) {
1647  	 	 	 return '';
1648  	 	 }
1649  
1650  	 	 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1651  	 	 	 return mb_strcut($string, 0, $len, $charset);
1652  	 	 } elseif ($charset == 'utf-8') {
1653  	 	 	 return $this->utf8_strtrunc($string, $len);
1654  	 	 } elseif ($this->eucBasedSets[$charset]) {
1655  	 	 	 return $this->euc_strtrunc($string, $len, $charset);
1656  	 	 } elseif ($this->twoByteSets[$charset]) {
1657  	 	 	 if ($len % 2) {
1658  	 	 	 	 $len--;
1659  	 	 	 } // don't cut at odd positions
1660  	 	 } elseif ($this->fourByteSets[$charset]) {
1661  	 	 	 $x = $len % 4;
1662  	 	 	 $len -= $x; // realign to position dividable by four
1663  	 	 }
1664  	 	 	 // treat everything else as single-byte encoding
1665  	 	 return substr($string, 0, $len);
1666  	 }
1667  
1668  	 /**
1669  	  * Translates all characters of a string into their respective case values.
1670  	  * Unlike strtolower() and strtoupper() this method is locale independent.
1671  	  * Note that the string length may change!
1672  	  * eg. lower case German "ß" (sharp S) becomes upper case "SS"
1673  	  * Unit-tested by Kasper
1674  	  * Real case folding is language dependent, this method ignores this fact.
1675  	  *
1676  	  * @param	 string	 	 Character set of string
1677  	  * @param	 string	 	 Input string to convert case for
1678  	  * @param	 string	 	 Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper" )
1679  	  * @return	 string	 	 The converted string
1680  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1681  	  * @see strtolower(), strtoupper()
1682  	  */
1683  	function conv_case($charset, $string, $case) {
1684  	 	 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1685  	 	 	 if ($case == 'toLower') {
1686  	 	 	 	 $string = mb_strtolower($string, $charset);
1687  	 	 	 } else {
1688  	 	 	 	 $string = mb_strtoupper($string, $charset);
1689  	 	 	 }
1690  	 	 } elseif ($charset == 'utf-8') {
1691  	 	 	 $string = $this->utf8_char_mapping($string, 'case', $case);
1692  	 	 } elseif (isset($this->eucBasedSets[$charset])) {
1693  	 	 	 $string = $this->euc_char_mapping($string, $charset, 'case', $case);
1694  	 	 } else {
1695  	 	 	 	 // treat everything else as single-byte encoding
1696  	 	 	 $string = $this->sb_char_mapping($string, $charset, 'case', $case);
1697  	 	 }
1698  
1699  	 	 return $string;
1700  	 }
1701  
1702  	 /**
1703  	  * Equivalent of lcfirst/ucfirst but using character set.
1704  	  *
1705  	  * @param string $charset
1706  	  * @param string $string
1707  	  * @param string $case
1708  	  * @return string
1709  	  * @see t3lib_cs::conv_case()
1710  	  */
1711  	public function convCaseFirst($charset, $string, $case) {
1712  	 	 $firstChar = $this->substr($charset, $string, 0, 1);
1713  	 	 $firstChar = $this->conv_case($charset, $firstChar, $case);
1714  	 	 $remainder = $this->substr($charset, $string, 1);
1715  	 	 return $firstChar . $remainder;
1716  	 }
1717  
1718  	 /**
1719  	  * Converts special chars (like æøåÆØÅ, umlauts etc) to ascii equivalents (usually double-bytes, like æ => ae etc.)
1720  	  *
1721  	  * @param string $charset Character set of string
1722  	  * @param string $string Input string to convert
1723  	  * @return string The converted string
1724  	  */
1725  	function specCharsToASCII($charset, $string) {
1726  	 	 if ($charset == 'utf-8') {
1727  	 	 	 $string = $this->utf8_char_mapping($string, 'ascii');
1728  	 	 } elseif (isset($this->eucBasedSets[$charset])) {
1729  	 	 	 $string = $this->euc_char_mapping($string, $charset, 'ascii');
1730  	 	 } else {
1731  	 	 	 	 // treat everything else as single-byte encoding
1732  	 	 	 $string = $this->sb_char_mapping($string, $charset, 'ascii');
1733  	 	 }
1734  
1735  	 	 return $string;
1736  	 }
1737  
1738  
1739  	 /**
1740  	  * converts the language codes that we get from the client (usually HTTP_ACCEPT_LANGUAGE)
1741  	  * into a TYPO3-readable language code
1742  	  * @param	 $languageCodesList	 list of language codes. something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
1743  	  *	 	 	  see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
1744  	  * @return	 string	 a preferred language that TYPO3 supports, or "default" if none found
1745  	  * @author	 Benjamin Mack (benni.typo3.org)
1746  	  */
1747  	public function getPreferredClientLanguage($languageCodesList) {
1748  	 	 $allLanguageCodes = array();
1749  	 	 $selectedLanguage = 'default';
1750  
1751  	 	 	 // get all languages where TYPO3 code is the same as the ISO code
1752  	 	 foreach ($this->charSetArray as $typo3Lang => $charSet) {
1753  	 	 	 $allLanguageCodes[$typo3Lang] = $typo3Lang;
1754  	 	 }
1755  
1756  	 	 	 // get all languages where TYPO3 code differs from ISO code
1757  	 	 	 // or needs the country part
1758  	 	 	 // the iso codes will here overwrite the default typo3 language in the key
1759  	 	 foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
1760  	 	 	 $isoLang = join('-', explode('_', $isoLang));
1761  	 	 	 $allLanguageCodes[$typo3Lang] = $isoLang;
1762  	 	 }
1763  
1764  	 	 	 // move the iso codes to the (because we're comparing the keys with "isset" later on)
1765  	 	 $allLanguageCodes = array_flip($allLanguageCodes);
1766  
1767  
1768  	 	 $preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
1769  	 	 	 // order the preferred languages after they key
1770  	 	 $sortedPreferredLanguages = array();
1771  	 	 foreach ($preferredLanguages as $preferredLanguage) {
1772  	 	 	 $quality = 1.0;
1773  	 	 	 if (strpos($preferredLanguage, ';q=') !== FALSE) {
1774  	 	 	 	 list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage);
1775  	 	 	 }
1776  	 	 	 $sortedPreferredLanguages[$preferredLanguage] = $quality;
1777  	 	 }
1778  
1779  	 	 	 // loop through the languages, with the highest priority first
1780  	 	 arsort($sortedPreferredLanguages, SORT_NUMERIC);
1781  	 	 foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
1782  	 	 	 if (isset($allLanguageCodes[$preferredLanguage])) {
1783  	 	 	 	 $selectedLanguage = $allLanguageCodes[$preferredLanguage];
1784  	 	 	 	 break;
1785  	 	 	 }
1786  
1787  	 	 	 	 // strip the country code from the end
1788  	 	 	 list($preferredLanguage, $preferredCountry) = explode('-', $preferredLanguage);
1789  	 	 	 if (isset($allLanguageCodes[$preferredLanguage])) {
1790  	 	 	 	 $selectedLanguage = $allLanguageCodes[$preferredLanguage];
1791  	 	 	 	 break;
1792  	 	 	 }
1793  	 	 }
1794  	 	 if (!$selectedLanguage || $selectedLanguage == 'en') {
1795  	 	 	 $selectedLanguage = 'default';
1796  	 	 }
1797  	 	 return $selectedLanguage;
1798  	 }
1799  
1800  
1801  	 /********************************************
1802  	  *
1803  	  * Internal string operation functions
1804  	  *
1805  	  ********************************************/
1806  
1807  	 /**
1808  	  * Maps all characters of a string in a single byte charset.
1809  	  *
1810  	  * @param	 string	 	 the string
1811  	  * @param	 string	 	 the charset
1812  	  * @param	 string	 	 mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
1813  	  * @param	 string	 	 'case': conversion 'toLower' or 'toUpper'
1814  	  * @return	 string	 	 the converted string
1815  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1816  	  */
1817  	function sb_char_mapping($str, $charset, $mode, $opt = '') {
1818  	 	 switch ($mode) {
1819  	 	 	 case 'case':
1820  	 	 	 	 if (!$this->initCaseFolding($charset)) {
1821  	 	 	 	 	 return $str;
1822  	 	 	 	 } // do nothing
1823  	 	 	 	 $map =& $this->caseFolding[$charset][$opt];
1824  	 	 	 	 break;
1825  
1826  	 	 	 case 'ascii':
1827  	 	 	 	 if (!$this->initToASCII($charset)) {
1828  	 	 	 	 	 return $str;
1829  	 	 	 	 } // do nothing
1830  	 	 	 	 $map =& $this->toASCII[$charset];
1831  	 	 	 	 break;
1832  
1833  	 	 	 default:
1834  	 	 	 	 return $str;
1835  	 	 }
1836  
1837  	 	 $out = '';
1838  	 	 for ($i = 0; strlen($str[$i]); $i++) {
1839  	 	 	 $c = $str[$i];
1840  	 	 	 if (isset($map[$c])) {
1841  	 	 	 	 $out .= $map[$c];
1842  	 	 	 } else {
1843  	 	 	 	 $out .= $c;
1844  	 	 	 }
1845  	 	 }
1846  
1847  	 	 return $out;
1848  	 }
1849  
1850  
1851  	 /********************************************
1852  	  *
1853  	  * Internal UTF-8 string operation functions
1854  	  *
1855  	  ********************************************/
1856  
1857  	 /**
1858  	  * Returns a part of a UTF-8 string.
1859  	  * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
1860  	  *
1861  	  * @param	 string	 	 UTF-8 string
1862  	  * @param	 integer	 	 Start position (character position)
1863  	  * @param	 integer	 	 Length (in characters)
1864  	  * @return	 string	 	 The substring
1865  	  * @see substr()
1866  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1867  	  */
1868  	function utf8_substr($str, $start, $len = NULL) {
1869  	 	 if (!strcmp($len, '0')) {
1870  	 	 	 return '';
1871  	 	 }
1872  
1873  	 	 $byte_start = $this->utf8_char2byte_pos($str, $start);
1874  	 	 if ($byte_start === FALSE) {
1875  	 	 	 if ($start > 0) {
1876  	 	 	 	 return FALSE; // $start outside string length
1877  	 	 	 } else {
1878  	 	 	 	 $start = 0;
1879  	 	 	 }
1880  	 	 }
1881  
1882  	 	 $str = substr($str, $byte_start);
1883  
1884  	 	 if ($len != NULL) {
1885  	 	 	 $byte_end = $this->utf8_char2byte_pos($str, $len);
1886  	 	 	 if ($byte_end === FALSE) // $len outside actual string length
1887  	 	 	 {
1888  	 	 	 	 return $len < 0 ? '' : $str;
1889  	 	 	 } // When length is less than zero and exceeds, then we return blank string.
1890  	 	 	 else
1891  	 	 	 {
1892  	 	 	 	 return substr($str, 0, $byte_end);
1893  	 	 	 }
1894  	 	 }
1895  	 	 else	 {
1896  	 	 	 return $str;
1897  	 	 }
1898  	 }
1899  
1900  	 /**
1901  	  * Counts the number of characters of a string in UTF-8.
1902  	  * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
1903  	  *
1904  	  * @param	 string	 	 UTF-8 multibyte character string
1905  	  * @return	 integer	 	 The number of characters
1906  	  * @see strlen()
1907  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1908  	  */
1909  	function utf8_strlen($str) {
1910  	 	 $n = 0;
1911  	 	 for ($i = 0; strlen($str[$i]); $i++) {
1912  	 	 	 $c = ord($str[$i]);
1913  	 	 	 if (!($c & 0x80)) // single-byte (0xxxxxx)
1914  	 	 	 {
1915  	 	 	 	 $n++;
1916  	 	 	 }
1917  	 	 	 elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
1918  	 	 	 {
1919  	 	 	 	 $n++;
1920  	 	 	 }
1921  	 	 }
1922  	 	 return $n;
1923  	 }
1924  
1925  	 /**
1926  	  * Truncates a string in UTF-8 short at a given byte length.
1927  	  *
1928  	  * @param	 string	 	 UTF-8 multibyte character string
1929  	  * @param	 integer	 	 the byte length
1930  	  * @return	 string	 	 the shortened string
1931  	  * @see mb_strcut()
1932  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1933  	  */
1934  	function utf8_strtrunc($str, $len) {
1935  	 	 $i = $len - 1;
1936  	 	 if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
1937  	 	 	 for (; $i > 0 && !(ord($str[$i]) & 0x40); $i--) {
1938  	 	 	 	 // find the first byte
1939  	 	 	 	 ;
1940  	 	 	 }
1941  	 	 	 if ($i <= 0) {
1942  	 	 	 	 return '';
1943  	 	 	 } // sanity check
1944  	 	 	 for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) {
1945  	 	 	 	 // calculate number of bytes
1946  	 	 	 	 $bc++;
1947  	 	 	 }
1948  	 	 	 if ($bc + $i > $len) {
1949  	 	 	 	 return substr($str, 0, $i);
1950  	 	 	 }
1951  	 	 	 // fallthru: multibyte char fits into length
1952  	 	 }
1953  	 	 return substr($str, 0, $len);
1954  	 }
1955  
1956  	 /**
1957  	  * Find position of first occurrence of a string, both arguments are in UTF-8.
1958  	  *
1959  	  * @param	 string	 	 UTF-8 string to search in
1960  	  * @param	 string	 	 UTF-8 string to search for
1961  	  * @param	 integer	 	 Positition to start the search
1962  	  * @return	 integer	 	 The character position
1963  	  * @see strpos()
1964  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1965  	  */
1966  	function utf8_strpos($haystack, $needle, $offset = 0) {
1967  	 	 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1968  	 	 	 return mb_strpos($haystack, $needle, $offset, 'utf-8');
1969  	 	 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1970  	 	 	 return iconv_strpos($haystack, $needle, $offset, 'utf-8');
1971  	 	 }
1972  
1973  	 	 $byte_offset = $this->utf8_char2byte_pos($haystack, $offset);
1974  	 	 if ($byte_offset === FALSE) {
1975  	 	 	 return FALSE;
1976  	 	 } // offset beyond string length
1977  
1978  	 	 $byte_pos = strpos($haystack, $needle, $byte_offset);
1979  	 	 if ($byte_pos === FALSE) {
1980  	 	 	 return FALSE;
1981  	 	 } // needle not found
1982  
1983  	 	 return $this->utf8_byte2char_pos($haystack, $byte_pos);
1984  	 }
1985  
1986  	 /**
1987  	  * Find position of last occurrence of a char in a string, both arguments are in UTF-8.
1988  	  *
1989  	  * @param	 string	 	 UTF-8 string to search in
1990  	  * @param	 string	 	 UTF-8 character to search for (single character)
1991  	  * @return	 integer	 	 The character position
1992  	  * @see strrpos()
1993  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
1994  	  */
1995  	function utf8_strrpos($haystack, $needle) {
1996  	 	 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
1997  	 	 	 return mb_strrpos($haystack, $needle, 'utf-8');
1998  	 	 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
1999  	 	 	 return iconv_strrpos($haystack, $needle, 'utf-8');
2000  	 	 }
2001  
2002  	 	 $byte_pos = strrpos($haystack, $needle);
2003  	 	 if ($byte_pos === FALSE) {
2004  	 	 	 return FALSE;
2005  	 	 } // needle not found
2006  
2007  	 	 return $this->utf8_byte2char_pos($haystack, $byte_pos);
2008  	 }
2009  
2010  	 /**
2011  	  * Translates a character position into an 'absolute' byte position.
2012  	  * Unit tested by Kasper.
2013  	  *
2014  	  * @param	 string	 	 UTF-8 string
2015  	  * @param	 integer	 	 Character position (negative values start from the end)
2016  	  * @return	 integer	 	 Byte position
2017  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
2018  	  */
2019  	function utf8_char2byte_pos($str, $pos) {
2020  	 	 $n = 0; // number of characters found
2021  	 	 $p = abs($pos); // number of characters wanted
2022  
2023  	 	 if ($pos >= 0) {
2024  	 	 	 $i = 0;
2025  	 	 	 $d = 1;
2026  	 	 } else {
2027  	 	 	 $i = strlen($str) - 1;
2028  	 	 	 $d = -1;
2029  	 	 }
2030  
2031  	 	 for (; strlen($str[$i]) && $n < $p; $i += $d) {
2032  	 	 	 $c = (int) ord($str[$i]);
2033  	 	 	 if (!($c & 0x80)) // single-byte (0xxxxxx)
2034  	 	 	 {
2035  	 	 	 	 $n++;
2036  	 	 	 }
2037  	 	 	 elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
2038  	 	 	 {
2039  	 	 	 	 $n++;
2040  	 	 	 }
2041  	 	 }
2042  	 	 if (!strlen($str[$i])) {
2043  	 	 	 return FALSE;
2044  	 	 } // offset beyond string length
2045  
2046  	 	 if ($pos >= 0) {
2047  	 	 	 	 // skip trailing multi-byte data bytes
2048  	 	 	 while ((ord($str[$i]) & 0x80) && !(ord($str[$i]) & 0x40)) {
2049  	 	 	 	 $i++;
2050  	 	 	 }
2051  	 	 } else {
2052  	 	 	 	 // correct offset
2053  	 	 	 $i++;
2054  	 	 }
2055  
2056  	 	 return $i;
2057  	 }
2058  
2059  	 /**
2060  	  * Translates an 'absolute' byte position into a character position.
2061  	  * Unit tested by Kasper.
2062  	  *
2063  	  * @param	 string	 	 UTF-8 string
2064  	  * @param	 integer	 	 byte position
2065  	  * @return	 integer	 	 character position
2066  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
2067  	  */
2068  	function utf8_byte2char_pos($str, $pos) {
2069  	 	 $n = 0; // number of characters
2070  	 	 for ($i = $pos; $i > 0; $i--) {
2071  	 	 	 $c = (int) ord($str[$i]);
2072  	 	 	 if (!($c & 0x80)) // single-byte (0xxxxxx)
2073  	 	 	 {
2074  	 	 	 	 $n++;
2075  	 	 	 }
2076  	 	 	 elseif (($c & 0xC0) == 0xC0) // multi-byte starting byte (11xxxxxx)
2077  	 	 	 {
2078  	 	 	 	 $n++;
2079  	 	 	 }
2080  	 	 }
2081  	 	 if (!strlen($str[$i])) {
2082  	 	 	 return FALSE;
2083  	 	 } // offset beyond string length
2084  
2085  	 	 return $n;
2086  	 }
2087  
2088  	 /**
2089  	  * Maps all characters of an UTF-8 string.
2090  	  *
2091  	  * @param	 string	 	 UTF-8 string
2092  	  * @param	 string	 	 mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2093  	  * @param	 string	 	 'case': conversion 'toLower' or 'toUpper'
2094  	  * @return	 string	 	 the converted string
2095  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
2096  	  */
2097  	function utf8_char_mapping($str, $mode, $opt = '') {
2098  	 	 if (!$this->initUnicodeData($mode)) {
2099  	 	 	 return $str;
2100  	 	 } // do nothing
2101  
2102  	 	 $out = '';
2103  	 	 switch ($mode) {
2104  	 	 	 case 'case':
2105  	 	 	 	 $map =& $this->caseFolding['utf-8'][$opt];
2106  	 	 	 	 break;
2107  
2108  	 	 	 case 'ascii':
2109  	 	 	 	 $map =& $this->toASCII['utf-8'];
2110  	 	 	 	 break;
2111  
2112  	 	 	 default:
2113  	 	 	 	 return $str;
2114  	 	 }
2115  
2116  	 	 for ($i = 0; strlen($str[$i]); $i++) {
2117  	 	 	 $c = ord($str[$i]);
2118  	 	 	 if (!($c & 0x80)) // single-byte (0xxxxxx)
2119  	 	 	 {
2120  	 	 	 	 $mbc = $str[$i];
2121  	 	 	 }
2122  	 	 	 elseif (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
2123  	 	 	 	 for ($bc = 0; $c & 0x80; $c = $c << 1) {
2124  	 	 	 	 	 $bc++;
2125  	 	 	 	 } // calculate number of bytes
2126  	 	 	 	 $mbc = substr($str, $i, $bc);
2127  	 	 	 	 $i += $bc - 1;
2128  	 	 	 }
2129  
2130  	 	 	 if (isset($map[$mbc])) {
2131  	 	 	 	 $out .= $map[$mbc];
2132  	 	 	 } else {
2133  	 	 	 	 $out .= $mbc;
2134  	 	 	 }
2135  	 	 }
2136  
2137  	 	 return $out;
2138  	 }
2139  
2140  
2141  	 /********************************************
2142  	  *
2143  	  * Internal EUC string operation functions
2144  	  *
2145  	  * Extended Unix Code:
2146  	  *  ASCII compatible 7bit single bytes chars
2147  	  *  8bit two byte chars
2148  	  *
2149  	  * Shift-JIS is treated as a special case.
2150  	  *
2151  	  ********************************************/
2152  
2153  	 /**
2154  	  * Cuts a string in the EUC charset family short at a given byte length.
2155  	  *
2156  	  * @param	 string	 	 EUC multibyte character string
2157  	  * @param	 integer	 	 the byte length
2158  	  * @param	 string	 	 the charset
2159  	  * @return	 string	 	 the shortened string
2160  	  * @see mb_strcut()
2161  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
2162  	  */
2163  	function euc_strtrunc($str, $len, $charset) {
2164  	 	 $sjis = ($charset == 'shift_jis');
2165  	 	 for ($i = 0; strlen($str[$i]) && $i < $len; $i++) {
2166  	 	 	 $c = ord($str[$i]);
2167  	 	 	 if ($sjis) {
2168  	 	 	 	 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2169  	 	 	 	 	 $i++;
2170  	 	 	 	 } // advance a double-byte char
2171  	 	 	 }
2172  	 	 	 else {
2173  	 	 	 	 if ($c >= 0x80) {
2174  	 	 	 	 	 $i++;
2175  	 	 	 	 } // advance a double-byte char
2176  	 	 	 }
2177  	 	 }
2178  	 	 if (!strlen($str[$i])) {
2179  	 	 	 return $str;
2180  	 	 } // string shorter than supplied length
2181  
2182  	 	 if ($i > $len) {
2183  	 	 	 return substr($str, 0, $len - 1); // we ended on a first byte
2184  	 	 } else {
2185  	 	 	 return substr($str, 0, $len);
2186  	 	 }
2187  	 }
2188  
2189  	 /**
2190  	  * Returns a part of a string in the EUC charset family.
2191  	  *
2192  	  * @param	 string	 	 EUC multibyte character string
2193  	  * @param	 integer	 	 start position (character position)
2194  	  * @param	 string	 	 the charset
2195  	  * @param	 integer	 	 length (in characters)
2196  	  * @return	 string	 	 the substring
2197  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
2198  	  */
2199  	function euc_substr($str, $start, $charset, $len = NULL) {
2200  	 	 $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
2201  	 	 if ($byte_start === FALSE) {
2202  	 	 	 return FALSE;
2203  	 	 } // $start outside string length
2204  
2205  	 	 $str = substr($str, $byte_start);
2206  
2207  	 	 if ($len != NULL) {
2208  	 	 	 $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
2209  	 	 	 if ($byte_end === FALSE) // $len outside actual string length
2210  	 	 	 {
2211  	 	 	 	 return $str;
2212  	 	 	 }
2213  	 	 	 else
2214  	 	 	 {
2215  	 	 	 	 return substr($str, 0, $byte_end);
2216  	 	 	 }
2217  	 	 }
2218  	 	 else	 {
2219  	 	 	 return $str;
2220  	 	 }
2221  	 }
2222  
2223  	 /**
2224  	  * Counts the number of characters of a string in the EUC charset family.
2225  	  *
2226  	  * @param	 string	 	 EUC multibyte character string
2227  	  * @param	 string	 	 the charset
2228  	  * @return	 integer	 	 the number of characters
2229  	  * @see strlen()
2230  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
2231  	  */
2232  	function euc_strlen($str, $charset) {
2233  	 	 $sjis = ($charset == 'shift_jis');
2234  	 	 $n = 0;
2235  	 	 for ($i = 0; strlen($str[$i]); $i++) {
2236  	 	 	 $c = ord($str[$i]);
2237  	 	 	 if ($sjis) {
2238  	 	 	 	 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2239  	 	 	 	 	 $i++;
2240  	 	 	 	 } // advance a double-byte char
2241  	 	 	 }
2242  	 	 	 else {
2243  	 	 	 	 if ($c >= 0x80) {
2244  	 	 	 	 	 $i++;
2245  	 	 	 	 } // advance a double-byte char
2246  	 	 	 }
2247  
2248  	 	 	 $n++;
2249  	 	 }
2250  
2251  	 	 return $n;
2252  	 }
2253  
2254  	 /**
2255  	  * Translates a character position into an 'absolute' byte position.
2256  	  *
2257  	  * @param	 string	 	 EUC multibyte character string
2258  	  * @param	 integer	 	 character position (negative values start from the end)
2259  	  * @param	 string	 	 the charset
2260  	  * @return	 integer	 	 byte position
2261  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
2262  	  */
2263  	function euc_char2byte_pos($str, $pos, $charset) {
2264  	 	 $sjis = ($charset == 'shift_jis');
2265  	 	 $n = 0; // number of characters seen
2266  	 	 $p = abs($pos); // number of characters wanted
2267  
2268  	 	 if ($pos >= 0) {
2269  	 	 	 $i = 0;
2270  	 	 	 $d = 1;
2271  	 	 } else {
2272  	 	 	 $i = strlen($str) - 1;
2273  	 	 	 $d = -1;
2274  	 	 }
2275  
2276  	 	 for (; strlen($str[$i]) && $n < $p; $i += $d) {
2277  	 	 	 $c = ord($str[$i]);
2278  	 	 	 if ($sjis) {
2279  	 	 	 	 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2280  	 	 	 	 	 $i += $d;
2281  	 	 	 	 } // advance a double-byte char
2282  	 	 	 }
2283  	 	 	 else {
2284  	 	 	 	 if ($c >= 0x80) {
2285  	 	 	 	 	 $i += $d;
2286  	 	 	 	 } // advance a double-byte char
2287  	 	 	 }
2288  
2289  	 	 	 $n++;
2290  	 	 }
2291  	 	 if (!strlen($str[$i])) {
2292  	 	 	 return FALSE;
2293  	 	 } // offset beyond string length
2294  
2295  	 	 if ($pos < 0) {
2296  	 	 	 $i++;
2297  	 	 } // correct offset
2298  
2299  	 	 return $i;
2300  	 }
2301  
2302  	 /**
2303  	  * Maps all characters of a string in the EUC charset family.
2304  	  *
2305  	  * @param	 string	 	 EUC multibyte character string
2306  	  * @param	 string	 	 the charset
2307  	  * @param	 string	 	 mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2308  	  * @param	 string	 	 'case': conversion 'toLower' or 'toUpper'
2309  	  * @return	 string	 	 the converted string
2310  	  * @author	 Martin Kutschker <martin.t.kutschker@blackbox.net>
2311  	  */
2312  	function euc_char_mapping($str, $charset, $mode, $opt = '') {
2313  	 	 switch ($mode) {
2314  	 	 	 case 'case':
2315  	 	 	 	 if (!$this->initCaseFolding($charset)) {
2316  	 	 	 	 	 return $str;
2317  	 	 	 	 } // do nothing
2318  	 	 	 	 $map =& $this->caseFolding[$charset][$opt];
2319  	 	 	 	 break;
2320  
2321  	 	 	 case 'ascii':
2322  	 	 	 	 if (!$this->initToASCII($charset)) {
2323  	 	 	 	 	 return $str;
2324  	 	 	 	 } // do nothing
2325  	 	 	 	 $map =& $this->toASCII[$charset];
2326  	 	 	 	 break;
2327  
2328  	 	 	 default:
2329  	 	 	 	 return $str;
2330  	 	 }
2331  
2332  	 	 $sjis = ($charset == 'shift_jis');
2333  	 	 $out = '';
2334  	 	 for ($i = 0; strlen($str[$i]); $i++) {
2335  	 	 	 $mbc = $str[$i];
2336  	 	 	 $c = ord($mbc);
2337  
2338  	 	 	 if ($sjis) {
2339  	 	 	 	 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
2340  	 	 	 	 	 $mbc = substr($str, $i, 2);
2341  	 	 	 	 	 $i++;
2342  	 	 	 	 }
2343  	 	 	 }
2344  	 	 	 else {
2345  	 	 	 	 if ($c >= 0x80) { // a double-byte char
2346  	 	 	 	 	 $mbc = substr($str, $i, 2);
2347  	 	 	 	 	 $i++;
2348  	 	 	 	 }
2349  	 	 	 }
2350  
2351  	 	 	 if (isset($map[$mbc])) {
2352  	 	 	 	 $out .= $map[$mbc];
2353  	 	 	 } else {
2354  	 	 	 	 $out .= $mbc;
2355  	 	 	 }
2356  	 	 }
2357  
2358  	 	 return $out;
2359  	 }
2360  
2361  }
2362  
// Load the XCLASS extension of this class if one is registered for the current TYPO3 mode.
if (defined('TYPO3_MODE')) {
	if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
		include_once $GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'];
	}
}
2366  
2367  ?>