See Release Notes
Long Term Support Release
Differences Between: [Versions 401 and 402] [Versions 401 and 403]
<?php

namespace PhpXmlRpc\Helper;

use PhpXmlRpc\PhpXmlRpc;
use PhpXmlRpc\Value;

/**
 * Deals with parsing the XML.
 * @see http://xmlrpc.com/spec.md
 *
 * Parsing is event driven: parse() registers the xmlrpc_* methods of this object as handlers of a php xml parser,
 * and the handlers accumulate state and results into the public $_xh array.
 *
 * @todo implement an interface to allow for alternative implementations
 *       - make access to $_xh protected, return more high-level data structures
 *       - add parseRequest, parseResponse, parseValue methods
 * @todo if iconv() or mb_string() are available, we could allow to convert the received xml to a custom charset encoding
 *       while parsing, which is faster than doing it later by going over the rebuilt data structure
 */
class XMLParser
{
    const RETURN_XMLRPCVALS = 'xmlrpcvals';
    const RETURN_EPIVALS = 'epivals';
    const RETURN_PHP = 'phpvals';

    const ACCEPT_REQUEST = 1;
    const ACCEPT_RESPONSE = 2;
    const ACCEPT_VALUE = 4;
    const ACCEPT_FAULT = 8;

    // Used to store state during parsing and to pass parsing results to callers.
    // Quick explanation of components:
    //  private:
    //    ac - used to accumulate values
    //    stack - array with genealogy of xml elements names used to validate nesting of xmlrpc elements
    //    valuestack - array used for parsing arrays and structs
    //    lv - used to indicate "looking for a value": implements the logic to allow values with no types to be strings
    //  public:
    //    isf - used to indicate an xml parsing fault (3), invalid xmlrpc fault (2) or xmlrpc response fault (1)
    //    isf_reason - used for storing xmlrpc response fault string
    //    value - used to store the value in responses
    //    method - used to store method name in requests
    //    params - used to store parameters in requests
    //    pt - used to store the type of each received parameter. Useful if parameters are automatically decoded to php values
    //    rt - 'methodcall', 'methodresponse', 'value' or 'fault' (the last one used only in EPI emulation mode)
    public $_xh = array(
        'ac' => '',
        'stack' => array(),
        'valuestack' => array(),
        'isf' => 0,
        'isf_reason' => '',
        'value' => null,
        'method' => false,
        'params' => array(),
        'pt' => array(),
        'rt' => '',
    );

    // Maps every known (upper-cased) element name to the list of element names it may be a direct child of
    public $xmlrpc_valid_parents = array(
        'VALUE' => array('MEMBER', 'DATA', 'PARAM', 'FAULT'),
        'BOOLEAN' => array('VALUE'),
        'I4' => array('VALUE'),
        'I8' => array('VALUE'),
        'EX:I8' => array('VALUE'),
        'INT' => array('VALUE'),
        'STRING' => array('VALUE'),
        'DOUBLE' => array('VALUE'),
        'DATETIME.ISO8601' => array('VALUE'),
        'BASE64' => array('VALUE'),
        'MEMBER' => array('STRUCT'),
        'NAME' => array('MEMBER'),
        'DATA' => array('ARRAY'),
        'ARRAY' => array('VALUE'),
        'STRUCT' => array('VALUE'),
        'PARAM' => array('PARAMS'),
        'METHODNAME' => array('METHODCALL'),
        'PARAMS' => array('METHODCALL', 'METHODRESPONSE'),
        'FAULT' => array('METHODRESPONSE'),
        'NIL' => array('VALUE'), // only used when extension activated
        'EX:NIL' => array('VALUE'), // only used when extension activated
    );

    /** @var array $parsing_options */
    protected $parsing_options = array();
    /** @var int $accept self::ACCEPT_REQUEST | self::ACCEPT_RESPONSE by default */
    protected $accept = 3;
    /** @var int $maxChunkLength 4 MB by default. Any value below 10MB should be good */
    protected $maxChunkLength = 4194304;

    /**
     * @param array $options passed to the xml parser
     */
    public function __construct(array $options = array())
    {
        $this->parsing_options = $options;
    }

    /**
     * Parses an xml string, storing the outcome in $this->_xh (nothing is returned).
     * Callers should check $this->_xh['isf'] and $this->_xh['isf_reason'] to find out about errors.
     *
     * @param string $data
     * @param string $returnType one of the self::RETURN_* constants; any other value is treated as self::RETURN_XMLRPCVALS
     * @param int $accept a bit-combination of self::ACCEPT_REQUEST, self::ACCEPT_RESPONSE, self::ACCEPT_VALUE
     * @param array $options xml parser options; applied after, and thus overriding, the ones passed to the constructor
     */
    public function parse($data, $returnType = self::RETURN_XMLRPCVALS, $accept = 3, $options = array())
    {
        // reset all state left over from a previous parsing run
        $this->_xh = array(
            'ac' => '',
            'stack' => array(),
            'valuestack' => array(),
            'isf' => 0,
            'isf_reason' => '',
            'value' => null,
            'method' => false, // so we can check later if we got a methodname or not
            'params' => array(),
            'pt' => array(),
            'rt' => '',
        );

        $len = strlen($data);

        // we test for empty documents here to save on resource allocation and simplify the chunked-parsing loop below
        if ($len == 0) {
            $this->_xh['isf'] = 3;
            $this->_xh['isf_reason'] = 'XML error 5: empty document';
            return;
        }

        $parser = xml_parser_create();

        foreach ($this->parsing_options as $key => $val) {
            xml_parser_set_option($parser, $key, $val);
        }
        foreach ($options as $key => $val) {
            xml_parser_set_option($parser, $key, $val);
        }
        // always set this, in case someone tries to disable it via options...
        // (the element handlers compare element names against upper-case strings)
        xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 1);

        xml_set_object($parser, $this);

        switch ($returnType) {
            case self::RETURN_PHP:
                xml_set_element_handler($parser, 'xmlrpc_se', 'xmlrpc_ee_fast');
                break;
            case self::RETURN_EPIVALS:
                xml_set_element_handler($parser, 'xmlrpc_se', 'xmlrpc_ee_epi');
                break;
            default:
                xml_set_element_handler($parser, 'xmlrpc_se', 'xmlrpc_ee');
        }

        xml_set_character_data_handler($parser, 'xmlrpc_cd');
        xml_set_default_handler($parser, 'xmlrpc_dh');

        $this->accept = $accept;

        // @see ticket #70 - we have to parse big xml docs in chunks to avoid errors
        for ($offset = 0; $offset < $len; $offset += $this->maxChunkLength) {
            $chunk = substr($data, $offset, $this->maxChunkLength);
            // the 3rd argument tells the parser whether this is the final chunk
            // error handling: xml not well formed
            if (!xml_parse($parser, $chunk, $offset + $this->maxChunkLength >= $len)) {
                $errCode = xml_get_error_code($parser);
                $errStr = sprintf('XML error %s: %s at line %d, column %d', $errCode, xml_error_string($errCode),
                    xml_get_current_line_number($parser), xml_get_current_column_number($parser));

                $this->_xh['isf'] = 3;
                $this->_xh['isf_reason'] = $errStr;
                break;
            }
        }

        xml_parser_free($parser);
    }

    /**
     * xml parser handler function for opening element tags.
     * Validates the nesting of the element and prepares the parsing state for its content.
     * @internal
     * @param resource $parser
     * @param string $name upper-cased element name (case folding is always enabled by parse())
     * @param $attrs
     * @param bool $acceptSingleVals DEPRECATED use the $accept parameter instead
     */
    public function xmlrpc_se($parser, $name, $attrs, $acceptSingleVals = false)
    {
        // if invalid xmlrpc already detected, skip all processing
        if ($this->_xh['isf'] < 2) {

            // check for correct element nesting
            if (count($this->_xh['stack']) == 0) {
                // top level element can only be of 2 types
                /// @todo optimization creep: save this check into a bool variable, instead of using count() every time:
                ///       there is only a single top level element in xml anyway
                // BC
                if ($acceptSingleVals === false) {
                    $accept = $this->accept;
                } else {
                    $accept = self::ACCEPT_REQUEST | self::ACCEPT_RESPONSE | self::ACCEPT_VALUE;
                }
                if (($name == 'METHODCALL' && ($accept & self::ACCEPT_REQUEST)) ||
                    ($name == 'METHODRESPONSE' && ($accept & self::ACCEPT_RESPONSE)) ||
                    ($name == 'VALUE' && ($accept & self::ACCEPT_VALUE)) ||
                    ($name == 'FAULT' && ($accept & self::ACCEPT_FAULT))) {
                    $this->_xh['rt'] = strtolower($name);
                } else {
                    $this->_xh['isf'] = 2;
                    $this->_xh['isf_reason'] = 'missing top level xmlrpc element. Found: ' . $name;

                    return;
                }
            } else {
                // not top level element: see if parent is OK
                $parent = end($this->_xh['stack']);
                if (!array_key_exists($name, $this->xmlrpc_valid_parents) || !in_array($parent, $this->xmlrpc_valid_parents[$name])) {
                    $this->_xh['isf'] = 2;
                    $this->_xh['isf_reason'] = "xmlrpc element $name cannot be child of $parent";

                    return;
                }
            }

            switch ($name) {
                // optimize for speed switch cases: most common cases first
                case 'VALUE':
                    /// @todo we could check for 2 VALUE elements inside a MEMBER or PARAM element
                    $this->_xh['vt'] = 'value'; // indicator: no value found yet
                    $this->_xh['ac'] = '';
                    $this->_xh['lv'] = 1;
                    $this->_xh['php_class'] = null;
                    break;
                case 'I8':
                case 'EX:I8':
                    if (PHP_INT_SIZE === 4) {
                        // INVALID ELEMENT: RAISE ISF so that it is later recognized!!!
                        $this->_xh['isf'] = 2;
                        $this->_xh['isf_reason'] = "Received i8 element but php is compiled in 32 bit mode";

                        return;
                    }
                    // fall through voluntarily
                case 'I4':
                case 'INT':
                case 'STRING':
                case 'BOOLEAN':
                case 'DOUBLE':
                case 'DATETIME.ISO8601':
                case 'BASE64':
                    if ($this->_xh['vt'] != 'value') {
                        // two data elements inside a value: an error occurred!
                        $this->_xh['isf'] = 2;
                        $this->_xh['isf_reason'] = "$name element following a {$this->_xh['vt']} element inside a single value";

                        return;
                    }
                    $this->_xh['ac'] = ''; // reset the accumulator
                    break;
                case 'STRUCT':
                case 'ARRAY':
                    if ($this->_xh['vt'] != 'value') {
                        // two data elements inside a value: an error occurred!
                        $this->_xh['isf'] = 2;
                        $this->_xh['isf_reason'] = "$name element following a {$this->_xh['vt']} element inside a single value";

                        return;
                    }
                    // create an empty array to hold child values, and push it onto appropriate stack
                    $curVal = array();
                    $curVal['values'] = array();
                    $curVal['type'] = $name;
                    // check for out-of-band information to rebuild php objs
                    // and in case it is found, save it
                    if (@isset($attrs['PHP_CLASS'])) {
                        $curVal['php_class'] = $attrs['PHP_CLASS'];
                    }
                    $this->_xh['valuestack'][] = $curVal;
                    $this->_xh['vt'] = 'data'; // be prepared for a data element next
                    break;
                case 'DATA':
                    if ($this->_xh['vt'] != 'data') {
                        // two data elements inside a value: an error occurred!
                        $this->_xh['isf'] = 2;
                        $this->_xh['isf_reason'] = "found two data elements inside an array element";

                        return;
                    }
                    // fall through intentionally: nothing else to do for a valid DATA element
                case 'METHODCALL':
                case 'METHODRESPONSE':
                case 'PARAMS':
                    // valid elements that add little to processing
                    break;
                case 'METHODNAME':
                case 'NAME':
                    /// @todo we could check for 2 NAME elements inside a MEMBER element
                    $this->_xh['ac'] = '';
                    break;
                case 'FAULT':
                    $this->_xh['isf'] = 1;
                    break;
                case 'MEMBER':
                    // set member name to the empty string, in case we do not find it in the xml later on
                    $this->_xh['valuestack'][count($this->_xh['valuestack']) - 1]['name'] = '';
                    //$this->_xh['ac']='';
                    // Drop through intentionally
                case 'PARAM':
                    // clear value type, so we can check later if no value has been passed for this param/member
                    $this->_xh['vt'] = null;
                    break;
                case 'NIL':
                case 'EX:NIL':
                    if (PhpXmlRpc::$xmlrpc_null_extension) {
                        if ($this->_xh['vt'] != 'value') {
                            // two data elements inside a value: an error occurred!
                            $this->_xh['isf'] = 2;
                            $this->_xh['isf_reason'] = "$name element following a {$this->_xh['vt']} element inside a single value";

                            return;
                        }
                        $this->_xh['ac'] = ''; // reset the accumulator
                        break;
                    }
                    // if here, we do not support the <NIL/> extension, so
                    // drop through intentionally
                default:
                    // INVALID ELEMENT: RAISE ISF so that it is later recognized!!!
                    $this->_xh['isf'] = 2;
                    $this->_xh['isf_reason'] = "found not-xmlrpc xml element $name";
                    break;
            }

            // Save current element name to stack, to validate nesting
            $this->_xh['stack'][] = $name;

            /// @todo optimization creep: move this inside the big switch() above
            if ($name != 'VALUE') {
                $this->_xh['lv'] = 0;
            }
        }
    }

    /**
     * xml parser handler function for opening element tags.
     * Used in decoding xml chunks that might represent single xmlrpc values as well as requests, responses.
     * @deprecated
     * @param resource $parser
     * @param $name
     * @param $attrs
     */
    public function xmlrpc_se_any($parser, $name, $attrs)
    {
        $this->xmlrpc_se($parser, $name, $attrs, true);
    }

    /**
     * xml parser handler function for close element tags.
     * Converts the accumulated character data into the value for the element being closed.
     * @internal
     * @param resource $parser
     * @param string $name upper-cased element name
     * @param int $rebuildXmlrpcvals >0 for rebuilding xmlrpcvals, 0 for rebuilding php values, -1 for xmlrpc-extension compatibility
     */
    public function xmlrpc_ee($parser, $name, $rebuildXmlrpcvals = 1)
    {
        if ($this->_xh['isf'] < 2) {
            // pop this element name from stack
            // NB: if XML validates, correct opening/closing is guaranteed and
            // we do not have to check for $name == $currElem.
            // we also checked for proper nesting at start of elements...
            $currElem = array_pop($this->_xh['stack']);

            switch ($name) {
                case 'VALUE':
                    // This if() detects if no scalar was inside <VALUE></VALUE>
                    if ($this->_xh['vt'] == 'value') {
                        $this->_xh['value'] = $this->_xh['ac'];
                        $this->_xh['vt'] = Value::$xmlrpcString;
                    }

                    if ($rebuildXmlrpcvals > 0) {
                        // build the xmlrpc val out of the data received, and substitute it
                        $temp = new Value($this->_xh['value'], $this->_xh['vt']);
                        // in case we got info about underlying php class, save it
                        // in the object we're rebuilding
                        if (isset($this->_xh['php_class'])) {
                            $temp->_php_class = $this->_xh['php_class'];
                        }
                        $this->_xh['value'] = $temp;
                    } elseif ($rebuildXmlrpcvals < 0) {
                        if ($this->_xh['vt'] == Value::$xmlrpcDateTime) {
                            $this->_xh['value'] = (object)array(
                                'xmlrpc_type' => 'datetime',
                                'scalar' => $this->_xh['value'],
                                'timestamp' => \PhpXmlRpc\Helper\Date::iso8601Decode($this->_xh['value'])
                            );
                        } elseif ($this->_xh['vt'] == Value::$xmlrpcBase64) {
                            $this->_xh['value'] = (object)array(
                                'xmlrpc_type' => 'base64',
                                'scalar' => $this->_xh['value']
                            );
                        }
                    } else {
                        /// @todo this should handle php-serialized objects,
                        /// since std deserializing is done by php_xmlrpc_decode,
                        /// which we will not be calling...
                        //if (isset($this->_xh['php_class'])) {
                        //}
                    }

                    // check if we are inside an array or struct:
                    // if value just built is inside an array, let's move it into array on the stack
                    $vscount = count($this->_xh['valuestack']);
                    if ($vscount && $this->_xh['valuestack'][$vscount - 1]['type'] == 'ARRAY') {
                        $this->_xh['valuestack'][$vscount - 1]['values'][] = $this->_xh['value'];
                    }
                    break;
                case 'BOOLEAN':
                case 'I4':
                case 'I8':
                case 'EX:I8':
                case 'INT':
                case 'STRING':
                case 'DOUBLE':
                case 'DATETIME.ISO8601':
                case 'BASE64':
                    $this->_xh['vt'] = strtolower($name);
                    /// @todo: optimization creep - remove the if/elseif cycle below
                    /// since the case() in which we are already did that
                    if ($name == 'STRING') {
                        $this->_xh['value'] = $this->_xh['ac'];
                    } elseif ($name == 'DATETIME.ISO8601') {
                        if (!preg_match('/^[0-9]{8}T[0-9]{2}:[0-9]{2}:[0-9]{2}$/', $this->_xh['ac'])) {
                            Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': invalid value received in DATETIME: ' . $this->_xh['ac']);
                        }
                        $this->_xh['vt'] = Value::$xmlrpcDateTime;
                        $this->_xh['value'] = $this->_xh['ac'];
                    } elseif ($name == 'BASE64') {
                        /// @todo check for failure of base64 decoding / catch warnings
                        $this->_xh['value'] = base64_decode($this->_xh['ac']);
                    } elseif ($name == 'BOOLEAN') {
                        // special case here: we translate boolean 1 or 0 into PHP
                        // constants true or false.
                        // Strings 'true' and 'false' are accepted, even though the
                        // spec never mentions them (see eg. Blogger api docs)
                        // NB: this simple checks helps a lot sanitizing input, ie no
                        // security problems around here
                        if ($this->_xh['ac'] == '1' || strcasecmp($this->_xh['ac'], 'true') == 0) {
                            $this->_xh['value'] = true;
                        } else {
                            // log if receiving something strange, even though we set the value to false anyway
                            if ($this->_xh['ac'] != '0' && strcasecmp($this->_xh['ac'], 'false') != 0) {
                                Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': invalid value received in BOOLEAN: ' . $this->_xh['ac']);
                            }
                            $this->_xh['value'] = false;
                        }
                    } elseif ($name == 'DOUBLE') {
                        // we have a DOUBLE
                        // we must check that only 0123456789-+.eE<space><tab> are characters here
                        // NB: the hyphen has to come first in the character class to be taken literally: written
                        // as '+-e' it would define a range covering, among others, all uppercase ascii letters
                        // NOTE: regexp could be much stricter than this...
                        if (!preg_match('/^[-+eE0123456789 \t.]+$/', $this->_xh['ac'])) {
                            /// @todo: find a better way of throwing an error than this!
                            Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': non numeric value received in DOUBLE: ' . $this->_xh['ac']);
                            $this->_xh['value'] = 'ERROR_NON_NUMERIC_FOUND';
                        } else {
                            // it's ok, add it on
                            $this->_xh['value'] = (double)$this->_xh['ac'];
                        }
                    } else {
                        // we have an I4/I8/INT
                        // we must check that only 0123456789-<space> are characters here
                        if (!preg_match('/^[+-]?[0123456789 \t]+$/', $this->_xh['ac'])) {
                            /// @todo find a better way of throwing an error than this!
                            Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': non numeric value received in INT: ' . $this->_xh['ac']);
                            $this->_xh['value'] = 'ERROR_NON_NUMERIC_FOUND';
                        } else {
                            // it's ok, add it on
                            $this->_xh['value'] = (int)$this->_xh['ac'];
                        }
                    }
                    $this->_xh['lv'] = 3; // indicate we've found a value
                    break;
                case 'NAME':
                    $this->_xh['valuestack'][count($this->_xh['valuestack']) - 1]['name'] = $this->_xh['ac'];
                    break;
                case 'MEMBER':
                    // add to array in the stack the last element built,
                    // unless no VALUE was found
                    if ($this->_xh['vt']) {
                        $vscount = count($this->_xh['valuestack']);
                        $this->_xh['valuestack'][$vscount - 1]['values'][$this->_xh['valuestack'][$vscount - 1]['name']] = $this->_xh['value'];
                    } else {
                        Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': missing VALUE inside STRUCT in received xml');
                    }
                    break;
                case 'DATA':
                    $this->_xh['vt'] = null; // reset this to check for 2 data elements in a row - even if they're empty
                    break;
                case 'STRUCT':
                case 'ARRAY':
                    // fetch out of stack array of values, and promote it to current value
                    $currVal = array_pop($this->_xh['valuestack']);
                    $this->_xh['value'] = $currVal['values'];
                    $this->_xh['vt'] = strtolower($name);
                    if (isset($currVal['php_class'])) {
                        $this->_xh['php_class'] = $currVal['php_class'];
                    }
                    break;
                case 'PARAM':
                    // add to array of params the current value,
                    // unless no VALUE was found
                    if ($this->_xh['vt']) {
                        $this->_xh['params'][] = $this->_xh['value'];
                        $this->_xh['pt'][] = $this->_xh['vt'];
                    } else {
                        Logger::instance()->errorLog('XML-RPC: ' . __METHOD__ . ': missing VALUE inside PARAM in received xml');
                    }
                    break;
                case 'METHODNAME':
                    // only leading whitespace is removed from the method name
                    $this->_xh['method'] = preg_replace('/^[\n\r\t ]+/', '', $this->_xh['ac']);
                    break;
                case 'NIL':
                case 'EX:NIL':
                    if (PhpXmlRpc::$xmlrpc_null_extension) {
                        $this->_xh['vt'] = 'null';
                        $this->_xh['value'] = null;
                        $this->_xh['lv'] = 3;
                        break;
                    }
                    // drop through intentionally if nil extension not enabled
                case 'PARAMS':
                case 'FAULT':
                case 'METHODCALL':
                case 'METHODRESPONSE':
                    break;
                default:
                    // End of INVALID ELEMENT!
                    // shall we add an assert here for unreachable code???
                    break;
            }
        }
    }

    /**
     * Used in decoding xmlrpc requests/responses without rebuilding xmlrpc Values.
     * @internal
     * @param resource $parser
     * @param string $name
     */
    public function xmlrpc_ee_fast($parser, $name)
    {
        $this->xmlrpc_ee($parser, $name, 0);
    }

    /**
     * Used in decoding xmlrpc requests/responses while building xmlrpc-extension Values (plain php for all but base64 and datetime).
     * @internal
     * @param resource $parser
     * @param string $name
     */
    public function xmlrpc_ee_epi($parser, $name)
    {
        $this->xmlrpc_ee($parser, $name, -1);
    }

    /**
     * xml parser handler function for character data.
     * @internal
     * @param resource $parser
     * @param string $data
     */
    public function xmlrpc_cd($parser, $data)
    {
        // skip processing if xml fault already detected
        if ($this->_xh['isf'] < 2) {
            // "lookforvalue==3" means that we've found an entire value
            // and should discard any further character data
            if ($this->_xh['lv'] != 3) {
                $this->_xh['ac'] .= $data;
            }
        }
    }

    /**
     * xml parser handler function for 'other stuff', ie. not char data or
     * element start/end tag. In fact it only gets called on unknown entities...
     * @internal
     * @param $parser
     * @param string $data
     */
    public function xmlrpc_dh($parser, $data)
    {
        // skip processing if xml fault already detected
        if ($this->_xh['isf'] < 2) {
            // only text which looks like an entity reference (&...;) is appended, verbatim, to the accumulator
            if (substr($data, 0, 1) == '&' && substr($data, -1, 1) == ';') {
                $this->_xh['ac'] .= $data;
            }
        }

        //return true;
    }

    /**
     * xml charset encoding guessing helper function.
     * Tries to determine the charset encoding of an XML chunk received over HTTP.
     * NB: according to the spec (RFC 3023), if text/xml content-type is received over HTTP without a charset,
     * we SHOULD assume it is strictly US-ASCII. But we try to be more tolerant of non conforming (legacy?) clients/servers,
     * which will be most probably using UTF-8 anyway...
     * In order of importance checks:
     * 1. http headers
     * 2. BOM
     * 3. XML declaration
     * 4. guesses using mb_detect_encoding()
     *
     * @param string $httpHeader the http Content-type header
     * @param string $xmlChunk xml content buffer
     * @param string $encodingPrefs comma separated list of character encodings to be used as default (when mb extension is enabled).
     *                              This can also be set globally using PhpXmlRpc::$xmlrpc_detectencodings
     * @return string the encoding determined, in upper case when it comes from the http header or the xml declaration.
     *                When it can't be determined that way: whatever mb_detect_encoding() returns if mbstring is enabled,
     *                PhpXmlRpc::$xmlrpc_defencoding if mbstring is not enabled
     *
     * @todo explore usage of mb_http_input(): does it detect http headers + post data? if so, use it instead of hand-detection!!!
     */
    public static function guessEncoding($httpHeader = '', $xmlChunk = '', $encodingPrefs = null)
    {
        // discussion: see http://www.yale.edu/pclt/encoding/
        // 1 - test if encoding is specified in HTTP HEADERS

        // Details:
        // LWS:           (\13\10)?( |\t)+
        // token:         (any char but excluded stuff)+
        // quoted string: " (any char but double quotes and control chars)* "
        // header:        Content-type = ...; charset=value(; ...)*
        //   where value is of type token, no LWS allowed between 'charset' and value
        // Note: we do not check for invalid chars in VALUE:
        //   this had better be done using pure ereg as below
        // Note 2: we might be removing whitespace/tabs that ought to be left in if
        //   the received charset is a quoted string. But nobody uses such charset names...

        /// @todo this test will pass if ANY header has charset specification, not only Content-Type. Fix it?
        $matches = array();
        if (preg_match('/;\s*charset\s*=([^;]+)/i', $httpHeader, $matches)) {
            return strtoupper(trim($matches[1], " \t\""));
        }

        // 2 - scan the first bytes of the data for a UTF-16 (or other) BOM pattern
        //     (source: http://www.w3.org/TR/2000/REC-xml-20001006)
        //     NOTE: actually, according to the spec, even if we find the BOM and determine
        //     an encoding, we should check if there is an encoding specified
        //     in the xml declaration, and verify if they match.
        /// @todo implement check as described above?
        /// @todo implement check for first bytes of string even without a BOM? (It sure looks harder than for cases WITH a BOM)
        if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
            return 'UCS-4';
        } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
            return 'UTF-16';
        } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
            return 'UTF-8';
        }

        // 3 - test if encoding is specified in the xml declaration
        // Details:
        // SPACE:         (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+
        // EQ:            SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]*
        if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
            '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
            $xmlChunk, $matches)) {
            // strip the quotes surrounding the encoding name
            return strtoupper(substr($matches[2], 1, -1));
        }

        // 4 - if mbstring is available, let it do the guesswork
        if (extension_loaded('mbstring')) {
            if ($encodingPrefs == null && PhpXmlRpc::$xmlrpc_detectencodings != null) {
                $encodingPrefs = PhpXmlRpc::$xmlrpc_detectencodings;
            }
            if ($encodingPrefs) {
                $enc = mb_detect_encoding($xmlChunk, $encodingPrefs);
            } else {
                $enc = mb_detect_encoding($xmlChunk);
            }
            // NB: mb_detect likes to call it ascii, xml parser likes to call it US_ASCII...
            // IANA also likes better US-ASCII, so go with it
            if ($enc == 'ASCII') {
                $enc = 'US-' . $enc;
            }

            return $enc;
        } else {
            // no encoding specified: as per HTTP1.1 assume it is iso-8859-1?
            // Both RFC 2616 (HTTP 1.1) and 1945 (HTTP 1.0) clearly state that for text/xxx content types
            // this should be the standard. And we should be getting text/xml as request and response.
            // BUT we have to be backward compatible with the lib, which always used UTF-8 as default...
            return PhpXmlRpc::$xmlrpc_defencoding;
        }
    }

    /**
     * Helper function: checks if an xml chunk has a charset declaration (BOM or in the xml declaration)
     *
     * @param string $xmlChunk
     * @return bool
     */
    public static function hasEncoding($xmlChunk)
    {
        // scan the first bytes of the data for a UTF-16 (or other) BOM pattern
        //     (source: http://www.w3.org/TR/2000/REC-xml-20001006)
        if (preg_match('/^(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00|\x00\x00\xFF\xFE|\xFE\xFF\x00\x00)/', $xmlChunk)) {
            return true;
        } elseif (preg_match('/^(\xFE\xFF|\xFF\xFE)/', $xmlChunk)) {
            return true;
        } elseif (preg_match('/^(\xEF\xBB\xBF)/', $xmlChunk)) {
            return true;
        }

        // test if encoding is specified in the xml declaration
        // Details:
        // SPACE:         (#x20 | #x9 | #xD | #xA)+ === [ \x9\xD\xA]+
        // EQ:            SPACE?=SPACE? === [ \x9\xD\xA]*=[ \x9\xD\xA]*
        if (preg_match('/^<\?xml\s+version\s*=\s*' . "((?:\"[a-zA-Z0-9_.:-]+\")|(?:'[a-zA-Z0-9_.:-]+'))" .
            '\s+encoding\s*=\s*' . "((?:\"[A-Za-z][A-Za-z0-9._-]*\")|(?:'[A-Za-z][A-Za-z0-9._-]*'))/",
            $xmlChunk, $matches)) {
            return true;
        }

        return false;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body