Differences Between: [Versions 311 and 401] [Versions 311 and 402] [Versions 311 and 403]
1 <?php 2 3 /** 4 * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library. 5 * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts. 6 * 7 * @note 8 * Recent changes to PHP's DOM extension have resulted in some fatal 9 * error conditions with the original version of PH5P. Pending changes, 10 * this lexer will punt to DirectLex if DOM throws an exception. 11 */ 12 13 class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex 14 { 15 /** 16 * @param string $html 17 * @param HTMLPurifier_Config $config 18 * @param HTMLPurifier_Context $context 19 * @return HTMLPurifier_Token[] 20 */ 21 public function tokenizeHTML($html, $config, $context) 22 { 23 $new_html = $this->normalize($html, $config, $context); 24 $new_html = $this->wrapHTML($new_html, $config, $context, false /* no div */); 25 try { 26 $parser = new HTML5($new_html); 27 $doc = $parser->save(); 28 } catch (DOMException $e) { 29 // Uh oh, it failed. Punt to DirectLex. 30 $lexer = new HTMLPurifier_Lexer_DirectLex(); 31 $context->register('PH5PError', $e); // save the error, so we can detect it 32 return $lexer->tokenizeHTML($html, $config, $context); // use original HTML 33 } 34 $tokens = array(); 35 $this->tokenizeDOM( 36 $doc->getElementsByTagName('html')->item(0)-> // <html> 37 getElementsByTagName('body')->item(0) // <body> 38 , 39 $tokens, $config 40 ); 41 return $tokens; 42 } 43 } 44 45 /* 46 47 Copyright 2007 Jeroen van der Meer <http://jero.net/> 48 49 Permission is hereby granted, free of charge, to any person obtaining a 50 copy of this software and associated documentation files (the 51 "Software"), to deal in the Software without restriction, including 52 without limitation the rights to use, copy, modify, merge, publish, 53 distribute, sublicense, and/or sell copies of the Software, and to 54 permit persons to whom the Software is furnished to do so, subject to 55 the following conditions: 56 57 The above copyright notice and this permission notice shall be 
included 58 in all copies or substantial portions of the Software. 59 60 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 61 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 62 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 63 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 64 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 65 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 66 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 67 68 */ 69 70 class HTML5 71 { 72 private $data; 73 private $char; 74 private $EOF; 75 private $state; 76 private $tree; 77 private $token; 78 private $content_model; 79 private $escape = false; 80 private $entities = array( 81 'AElig;', 82 'AElig', 83 'AMP;', 84 'AMP', 85 'Aacute;', 86 'Aacute', 87 'Acirc;', 88 'Acirc', 89 'Agrave;', 90 'Agrave', 91 'Alpha;', 92 'Aring;', 93 'Aring', 94 'Atilde;', 95 'Atilde', 96 'Auml;', 97 'Auml', 98 'Beta;', 99 'COPY;', 100 'COPY', 101 'Ccedil;', 102 'Ccedil', 103 'Chi;', 104 'Dagger;', 105 'Delta;', 106 'ETH;', 107 'ETH', 108 'Eacute;', 109 'Eacute', 110 'Ecirc;', 111 'Ecirc', 112 'Egrave;', 113 'Egrave', 114 'Epsilon;', 115 'Eta;', 116 'Euml;', 117 'Euml', 118 'GT;', 119 'GT', 120 'Gamma;', 121 'Iacute;', 122 'Iacute', 123 'Icirc;', 124 'Icirc', 125 'Igrave;', 126 'Igrave', 127 'Iota;', 128 'Iuml;', 129 'Iuml', 130 'Kappa;', 131 'LT;', 132 'LT', 133 'Lambda;', 134 'Mu;', 135 'Ntilde;', 136 'Ntilde', 137 'Nu;', 138 'OElig;', 139 'Oacute;', 140 'Oacute', 141 'Ocirc;', 142 'Ocirc', 143 'Ograve;', 144 'Ograve', 145 'Omega;', 146 'Omicron;', 147 'Oslash;', 148 'Oslash', 149 'Otilde;', 150 'Otilde', 151 'Ouml;', 152 'Ouml', 153 'Phi;', 154 'Pi;', 155 'Prime;', 156 'Psi;', 157 'QUOT;', 158 'QUOT', 159 'REG;', 160 'REG', 161 'Rho;', 162 'Scaron;', 163 'Sigma;', 164 'THORN;', 165 'THORN', 166 'TRADE;', 167 'Tau;', 168 'Theta;', 169 'Uacute;', 170 'Uacute', 171 'Ucirc;', 172 'Ucirc', 173 
'Ugrave;', 174 'Ugrave', 175 'Upsilon;', 176 'Uuml;', 177 'Uuml', 178 'Xi;', 179 'Yacute;', 180 'Yacute', 181 'Yuml;', 182 'Zeta;', 183 'aacute;', 184 'aacute', 185 'acirc;', 186 'acirc', 187 'acute;', 188 'acute', 189 'aelig;', 190 'aelig', 191 'agrave;', 192 'agrave', 193 'alefsym;', 194 'alpha;', 195 'amp;', 196 'amp', 197 'and;', 198 'ang;', 199 'apos;', 200 'aring;', 201 'aring', 202 'asymp;', 203 'atilde;', 204 'atilde', 205 'auml;', 206 'auml', 207 'bdquo;', 208 'beta;', 209 'brvbar;', 210 'brvbar', 211 'bull;', 212 'cap;', 213 'ccedil;', 214 'ccedil', 215 'cedil;', 216 'cedil', 217 'cent;', 218 'cent', 219 'chi;', 220 'circ;', 221 'clubs;', 222 'cong;', 223 'copy;', 224 'copy', 225 'crarr;', 226 'cup;', 227 'curren;', 228 'curren', 229 'dArr;', 230 'dagger;', 231 'darr;', 232 'deg;', 233 'deg', 234 'delta;', 235 'diams;', 236 'divide;', 237 'divide', 238 'eacute;', 239 'eacute', 240 'ecirc;', 241 'ecirc', 242 'egrave;', 243 'egrave', 244 'empty;', 245 'emsp;', 246 'ensp;', 247 'epsilon;', 248 'equiv;', 249 'eta;', 250 'eth;', 251 'eth', 252 'euml;', 253 'euml', 254 'euro;', 255 'exist;', 256 'fnof;', 257 'forall;', 258 'frac12;', 259 'frac12', 260 'frac14;', 261 'frac14', 262 'frac34;', 263 'frac34', 264 'frasl;', 265 'gamma;', 266 'ge;', 267 'gt;', 268 'gt', 269 'hArr;', 270 'harr;', 271 'hearts;', 272 'hellip;', 273 'iacute;', 274 'iacute', 275 'icirc;', 276 'icirc', 277 'iexcl;', 278 'iexcl', 279 'igrave;', 280 'igrave', 281 'image;', 282 'infin;', 283 'int;', 284 'iota;', 285 'iquest;', 286 'iquest', 287 'isin;', 288 'iuml;', 289 'iuml', 290 'kappa;', 291 'lArr;', 292 'lambda;', 293 'lang;', 294 'laquo;', 295 'laquo', 296 'larr;', 297 'lceil;', 298 'ldquo;', 299 'le;', 300 'lfloor;', 301 'lowast;', 302 'loz;', 303 'lrm;', 304 'lsaquo;', 305 'lsquo;', 306 'lt;', 307 'lt', 308 'macr;', 309 'macr', 310 'mdash;', 311 'micro;', 312 'micro', 313 'middot;', 314 'middot', 315 'minus;', 316 'mu;', 317 'nabla;', 318 'nbsp;', 319 'nbsp', 320 'ndash;', 321 'ne;', 
322 'ni;', 323 'not;', 324 'not', 325 'notin;', 326 'nsub;', 327 'ntilde;', 328 'ntilde', 329 'nu;', 330 'oacute;', 331 'oacute', 332 'ocirc;', 333 'ocirc', 334 'oelig;', 335 'ograve;', 336 'ograve', 337 'oline;', 338 'omega;', 339 'omicron;', 340 'oplus;', 341 'or;', 342 'ordf;', 343 'ordf', 344 'ordm;', 345 'ordm', 346 'oslash;', 347 'oslash', 348 'otilde;', 349 'otilde', 350 'otimes;', 351 'ouml;', 352 'ouml', 353 'para;', 354 'para', 355 'part;', 356 'permil;', 357 'perp;', 358 'phi;', 359 'pi;', 360 'piv;', 361 'plusmn;', 362 'plusmn', 363 'pound;', 364 'pound', 365 'prime;', 366 'prod;', 367 'prop;', 368 'psi;', 369 'quot;', 370 'quot', 371 'rArr;', 372 'radic;', 373 'rang;', 374 'raquo;', 375 'raquo', 376 'rarr;', 377 'rceil;', 378 'rdquo;', 379 'real;', 380 'reg;', 381 'reg', 382 'rfloor;', 383 'rho;', 384 'rlm;', 385 'rsaquo;', 386 'rsquo;', 387 'sbquo;', 388 'scaron;', 389 'sdot;', 390 'sect;', 391 'sect', 392 'shy;', 393 'shy', 394 'sigma;', 395 'sigmaf;', 396 'sim;', 397 'spades;', 398 'sub;', 399 'sube;', 400 'sum;', 401 'sup1;', 402 'sup1', 403 'sup2;', 404 'sup2', 405 'sup3;', 406 'sup3', 407 'sup;', 408 'supe;', 409 'szlig;', 410 'szlig', 411 'tau;', 412 'there4;', 413 'theta;', 414 'thetasym;', 415 'thinsp;', 416 'thorn;', 417 'thorn', 418 'tilde;', 419 'times;', 420 'times', 421 'trade;', 422 'uArr;', 423 'uacute;', 424 'uacute', 425 'uarr;', 426 'ucirc;', 427 'ucirc', 428 'ugrave;', 429 'ugrave', 430 'uml;', 431 'uml', 432 'upsih;', 433 'upsilon;', 434 'uuml;', 435 'uuml', 436 'weierp;', 437 'xi;', 438 'yacute;', 439 'yacute', 440 'yen;', 441 'yen', 442 'yuml;', 443 'yuml', 444 'zeta;', 445 'zwj;', 446 'zwnj;' 447 ); 448 449 const PCDATA = 0; 450 const RCDATA = 1; 451 const CDATA = 2; 452 const PLAINTEXT = 3; 453 454 const DOCTYPE = 0; 455 const STARTTAG = 1; 456 const ENDTAG = 2; 457 const COMMENT = 3; 458 const CHARACTR = 4; 459 const EOF = 5; 460 461 public function __construct($data) 462 { 463 $this->data = $data; 464 $this->char = -1; 465 
$this->EOF = strlen($data); 466 $this->tree = new HTML5TreeConstructer; 467 $this->content_model = self::PCDATA; 468 469 $this->state = 'data'; 470 471 while ($this->state !== null) { 472 $this->{$this->state . 'State'}(); 473 } 474 } 475 476 public function save() 477 { 478 return $this->tree->save(); 479 } 480 481 private function char() 482 { 483 return ($this->char < $this->EOF) 484 ? $this->data[$this->char] 485 : false; 486 } 487 488 private function character($s, $l = 0) 489 { 490 if ($s + $l < $this->EOF) { 491 if ($l === 0) { 492 return $this->data[$s]; 493 } else { 494 return substr($this->data, $s, $l); 495 } 496 } 497 } 498 499 private function characters($char_class, $start) 500 { 501 return preg_replace('#^([' . $char_class . ']+).*#s', '\\1', substr($this->data, $start)); 502 } 503 504 private function dataState() 505 { 506 // Consume the next input character 507 $this->char++; 508 $char = $this->char(); 509 510 if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) { 511 /* U+0026 AMPERSAND (&) 512 When the content model flag is set to one of the PCDATA or RCDATA 513 states: switch to the entity data state. Otherwise: treat it as per 514 the "anything else" entry below. */ 515 $this->state = 'entityData'; 516 517 } elseif ($char === '-') { 518 /* If the content model flag is set to either the RCDATA state or 519 the CDATA state, and the escape flag is false, and there are at 520 least three characters before this one in the input stream, and the 521 last four characters in the input stream, including this one, are 522 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS, 523 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. 
*/ 524 if (($this->content_model === self::RCDATA || $this->content_model === 525 self::CDATA) && $this->escape === false && 526 $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--' 527 ) { 528 $this->escape = true; 529 } 530 531 /* In any case, emit the input character as a character token. Stay 532 in the data state. */ 533 $this->emitToken( 534 array( 535 'type' => self::CHARACTR, 536 'data' => $char 537 ) 538 ); 539 540 /* U+003C LESS-THAN SIGN (<) */ 541 } elseif ($char === '<' && ($this->content_model === self::PCDATA || 542 (($this->content_model === self::RCDATA || 543 $this->content_model === self::CDATA) && $this->escape === false)) 544 ) { 545 /* When the content model flag is set to the PCDATA state: switch 546 to the tag open state. 547 548 When the content model flag is set to either the RCDATA state or 549 the CDATA state and the escape flag is false: switch to the tag 550 open state. 551 552 Otherwise: treat it as per the "anything else" entry below. */ 553 $this->state = 'tagOpen'; 554 555 /* U+003E GREATER-THAN SIGN (>) */ 556 } elseif ($char === '>') { 557 /* If the content model flag is set to either the RCDATA state or 558 the CDATA state, and the escape flag is true, and the last three 559 characters in the input stream including this one are U+002D 560 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"), 561 set the escape flag to false. */ 562 if (($this->content_model === self::RCDATA || 563 $this->content_model === self::CDATA) && $this->escape === true && 564 $this->character($this->char, 3) === '-->' 565 ) { 566 $this->escape = false; 567 } 568 569 /* In any case, emit the input character as a character token. 570 Stay in the data state. */ 571 $this->emitToken( 572 array( 573 'type' => self::CHARACTR, 574 'data' => $char 575 ) 576 ); 577 578 } elseif ($this->char === $this->EOF) { 579 /* EOF 580 Emit an end-of-file token. 
*/ 581 $this->EOF(); 582 583 } elseif ($this->content_model === self::PLAINTEXT) { 584 /* When the content model flag is set to the PLAINTEXT state 585 THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of 586 the text and emit it as a character token. */ 587 $this->emitToken( 588 array( 589 'type' => self::CHARACTR, 590 'data' => substr($this->data, $this->char) 591 ) 592 ); 593 594 $this->EOF(); 595 596 } else { 597 /* Anything else 598 THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that 599 otherwise would also be treated as a character token and emit it 600 as a single character token. Stay in the data state. */ 601 $len = strcspn($this->data, '<&', $this->char); 602 $char = substr($this->data, $this->char, $len); 603 $this->char += $len - 1; 604 605 $this->emitToken( 606 array( 607 'type' => self::CHARACTR, 608 'data' => $char 609 ) 610 ); 611 612 $this->state = 'data'; 613 } 614 } 615 616 private function entityDataState() 617 { 618 // Attempt to consume an entity. 619 $entity = $this->entity(); 620 621 // If nothing is returned, emit a U+0026 AMPERSAND character token. 622 // Otherwise, emit the character token that was returned. 623 $char = (!$entity) ? '&' : $entity; 624 $this->emitToken( 625 array( 626 'type' => self::CHARACTR, 627 'data' => $char 628 ) 629 ); 630 631 // Finally, switch to the data state. 632 $this->state = 'data'; 633 } 634 635 private function tagOpenState() 636 { 637 switch ($this->content_model) { 638 case self::RCDATA: 639 case self::CDATA: 640 /* If the next input character is a U+002F SOLIDUS (/) character, 641 consume it and switch to the close tag open state. If the next 642 input character is not a U+002F SOLIDUS (/) character, emit a 643 U+003C LESS-THAN SIGN character token and switch to the data 644 state to process the next input character. 
*/ 645 if ($this->character($this->char + 1) === '/') { 646 $this->char++; 647 $this->state = 'closeTagOpen'; 648 649 } else { 650 $this->emitToken( 651 array( 652 'type' => self::CHARACTR, 653 'data' => '<' 654 ) 655 ); 656 657 $this->state = 'data'; 658 } 659 break; 660 661 case self::PCDATA: 662 // If the content model flag is set to the PCDATA state 663 // Consume the next input character: 664 $this->char++; 665 $char = $this->char(); 666 667 if ($char === '!') { 668 /* U+0021 EXCLAMATION MARK (!) 669 Switch to the markup declaration open state. */ 670 $this->state = 'markupDeclarationOpen'; 671 672 } elseif ($char === '/') { 673 /* U+002F SOLIDUS (/) 674 Switch to the close tag open state. */ 675 $this->state = 'closeTagOpen'; 676 677 } elseif (preg_match('/^[A-Za-z]$/', $char)) { 678 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z 679 Create a new start tag token, set its tag name to the lowercase 680 version of the input character (add 0x0020 to the character's code 681 point), then switch to the tag name state. (Don't emit the token 682 yet; further details will be filled in before it is emitted.) */ 683 $this->token = array( 684 'name' => strtolower($char), 685 'type' => self::STARTTAG, 686 'attr' => array() 687 ); 688 689 $this->state = 'tagName'; 690 691 } elseif ($char === '>') { 692 /* U+003E GREATER-THAN SIGN (>) 693 Parse error. Emit a U+003C LESS-THAN SIGN character token and a 694 U+003E GREATER-THAN SIGN character token. Switch to the data state. */ 695 $this->emitToken( 696 array( 697 'type' => self::CHARACTR, 698 'data' => '<>' 699 ) 700 ); 701 702 $this->state = 'data'; 703 704 } elseif ($char === '?') { 705 /* U+003F QUESTION MARK (?) 706 Parse error. Switch to the bogus comment state. */ 707 $this->state = 'bogusComment'; 708 709 } else { 710 /* Anything else 711 Parse error. Emit a U+003C LESS-THAN SIGN character token and 712 reconsume the current input character in the data state. 
*/ 713 $this->emitToken( 714 array( 715 'type' => self::CHARACTR, 716 'data' => '<' 717 ) 718 ); 719 720 $this->char--; 721 $this->state = 'data'; 722 } 723 break; 724 } 725 } 726 727 private function closeTagOpenState() 728 { 729 $next_node = strtolower($this->characters('A-Za-z', $this->char + 1)); 730 $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName; 731 732 if (($this->content_model === self::RCDATA || $this->content_model === self::CDATA) && 733 (!$the_same || ($the_same && (!preg_match( 734 '/[\t\n\x0b\x0c >\/]/', 735 $this->character($this->char + 1 + strlen($next_node)) 736 ) || $this->EOF === $this->char))) 737 ) { 738 /* If the content model flag is set to the RCDATA or CDATA states then 739 examine the next few characters. If they do not match the tag name of 740 the last start tag token emitted (case insensitively), or if they do but 741 they are not immediately followed by one of the following characters: 742 * U+0009 CHARACTER TABULATION 743 * U+000A LINE FEED (LF) 744 * U+000B LINE TABULATION 745 * U+000C FORM FEED (FF) 746 * U+0020 SPACE 747 * U+003E GREATER-THAN SIGN (>) 748 * U+002F SOLIDUS (/) 749 * EOF 750 ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character 751 token, a U+002F SOLIDUS character token, and switch to the data state 752 to process the next input character. 
*/ 753 $this->emitToken( 754 array( 755 'type' => self::CHARACTR, 756 'data' => '</' 757 ) 758 ); 759 760 $this->state = 'data'; 761 762 } else { 763 /* Otherwise, if the content model flag is set to the PCDATA state, 764 or if the next few characters do match that tag name, consume the 765 next input character: */ 766 $this->char++; 767 $char = $this->char(); 768 769 if (preg_match('/^[A-Za-z]$/', $char)) { 770 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z 771 Create a new end tag token, set its tag name to the lowercase version 772 of the input character (add 0x0020 to the character's code point), then 773 switch to the tag name state. (Don't emit the token yet; further details 774 will be filled in before it is emitted.) */ 775 $this->token = array( 776 'name' => strtolower($char), 777 'type' => self::ENDTAG 778 ); 779 780 $this->state = 'tagName'; 781 782 } elseif ($char === '>') { 783 /* U+003E GREATER-THAN SIGN (>) 784 Parse error. Switch to the data state. */ 785 $this->state = 'data'; 786 787 } elseif ($this->char === $this->EOF) { 788 /* EOF 789 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F 790 SOLIDUS character token. Reconsume the EOF character in the data state. */ 791 $this->emitToken( 792 array( 793 'type' => self::CHARACTR, 794 'data' => '</' 795 ) 796 ); 797 798 $this->char--; 799 $this->state = 'data'; 800 801 } else { 802 /* Parse error. Switch to the bogus comment state. */ 803 $this->state = 'bogusComment'; 804 } 805 } 806 } 807 808 private function tagNameState() 809 { 810 // Consume the next input character: 811 $this->char++; 812 $char = $this->character($this->char); 813 814 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 815 /* U+0009 CHARACTER TABULATION 816 U+000A LINE FEED (LF) 817 U+000B LINE TABULATION 818 U+000C FORM FEED (FF) 819 U+0020 SPACE 820 Switch to the before attribute name state. 
*/ 821 $this->state = 'beforeAttributeName'; 822 823 } elseif ($char === '>') { 824 /* U+003E GREATER-THAN SIGN (>) 825 Emit the current tag token. Switch to the data state. */ 826 $this->emitToken($this->token); 827 $this->state = 'data'; 828 829 } elseif ($this->char === $this->EOF) { 830 /* EOF 831 Parse error. Emit the current tag token. Reconsume the EOF 832 character in the data state. */ 833 $this->emitToken($this->token); 834 835 $this->char--; 836 $this->state = 'data'; 837 838 } elseif ($char === '/') { 839 /* U+002F SOLIDUS (/) 840 Parse error unless this is a permitted slash. Switch to the before 841 attribute name state. */ 842 $this->state = 'beforeAttributeName'; 843 844 } else { 845 /* Anything else 846 Append the current input character to the current tag token's tag name. 847 Stay in the tag name state. */ 848 $this->token['name'] .= strtolower($char); 849 $this->state = 'tagName'; 850 } 851 } 852 853 private function beforeAttributeNameState() 854 { 855 // Consume the next input character: 856 $this->char++; 857 $char = $this->character($this->char); 858 859 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 860 /* U+0009 CHARACTER TABULATION 861 U+000A LINE FEED (LF) 862 U+000B LINE TABULATION 863 U+000C FORM FEED (FF) 864 U+0020 SPACE 865 Stay in the before attribute name state. */ 866 $this->state = 'beforeAttributeName'; 867 868 } elseif ($char === '>') { 869 /* U+003E GREATER-THAN SIGN (>) 870 Emit the current tag token. Switch to the data state. */ 871 $this->emitToken($this->token); 872 $this->state = 'data'; 873 874 } elseif ($char === '/') { 875 /* U+002F SOLIDUS (/) 876 Parse error unless this is a permitted slash. Stay in the before 877 attribute name state. */ 878 $this->state = 'beforeAttributeName'; 879 880 } elseif ($this->char === $this->EOF) { 881 /* EOF 882 Parse error. Emit the current tag token. Reconsume the EOF 883 character in the data state. 
*/ 884 $this->emitToken($this->token); 885 886 $this->char--; 887 $this->state = 'data'; 888 889 } else { 890 /* Anything else 891 Start a new attribute in the current tag token. Set that attribute's 892 name to the current input character, and its value to the empty string. 893 Switch to the attribute name state. */ 894 $this->token['attr'][] = array( 895 'name' => strtolower($char), 896 'value' => null 897 ); 898 899 $this->state = 'attributeName'; 900 } 901 } 902 903 private function attributeNameState() 904 { 905 // Consume the next input character: 906 $this->char++; 907 $char = $this->character($this->char); 908 909 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 910 /* U+0009 CHARACTER TABULATION 911 U+000A LINE FEED (LF) 912 U+000B LINE TABULATION 913 U+000C FORM FEED (FF) 914 U+0020 SPACE 915 Stay in the before attribute name state. */ 916 $this->state = 'afterAttributeName'; 917 918 } elseif ($char === '=') { 919 /* U+003D EQUALS SIGN (=) 920 Switch to the before attribute value state. */ 921 $this->state = 'beforeAttributeValue'; 922 923 } elseif ($char === '>') { 924 /* U+003E GREATER-THAN SIGN (>) 925 Emit the current tag token. Switch to the data state. */ 926 $this->emitToken($this->token); 927 $this->state = 'data'; 928 929 } elseif ($char === '/' && $this->character($this->char + 1) !== '>') { 930 /* U+002F SOLIDUS (/) 931 Parse error unless this is a permitted slash. Switch to the before 932 attribute name state. */ 933 $this->state = 'beforeAttributeName'; 934 935 } elseif ($this->char === $this->EOF) { 936 /* EOF 937 Parse error. Emit the current tag token. Reconsume the EOF 938 character in the data state. */ 939 $this->emitToken($this->token); 940 941 $this->char--; 942 $this->state = 'data'; 943 944 } else { 945 /* Anything else 946 Append the current input character to the current attribute's name. 947 Stay in the attribute name state. 
*/ 948 $last = count($this->token['attr']) - 1; 949 $this->token['attr'][$last]['name'] .= strtolower($char); 950 951 $this->state = 'attributeName'; 952 } 953 } 954 955 private function afterAttributeNameState() 956 { 957 // Consume the next input character: 958 $this->char++; 959 $char = $this->character($this->char); 960 961 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 962 /* U+0009 CHARACTER TABULATION 963 U+000A LINE FEED (LF) 964 U+000B LINE TABULATION 965 U+000C FORM FEED (FF) 966 U+0020 SPACE 967 Stay in the after attribute name state. */ 968 $this->state = 'afterAttributeName'; 969 970 } elseif ($char === '=') { 971 /* U+003D EQUALS SIGN (=) 972 Switch to the before attribute value state. */ 973 $this->state = 'beforeAttributeValue'; 974 975 } elseif ($char === '>') { 976 /* U+003E GREATER-THAN SIGN (>) 977 Emit the current tag token. Switch to the data state. */ 978 $this->emitToken($this->token); 979 $this->state = 'data'; 980 981 } elseif ($char === '/' && $this->character($this->char + 1) !== '>') { 982 /* U+002F SOLIDUS (/) 983 Parse error unless this is a permitted slash. Switch to the 984 before attribute name state. */ 985 $this->state = 'beforeAttributeName'; 986 987 } elseif ($this->char === $this->EOF) { 988 /* EOF 989 Parse error. Emit the current tag token. Reconsume the EOF 990 character in the data state. */ 991 $this->emitToken($this->token); 992 993 $this->char--; 994 $this->state = 'data'; 995 996 } else { 997 /* Anything else 998 Start a new attribute in the current tag token. Set that attribute's 999 name to the current input character, and its value to the empty string. 1000 Switch to the attribute name state. 
*/ 1001 $this->token['attr'][] = array( 1002 'name' => strtolower($char), 1003 'value' => null 1004 ); 1005 1006 $this->state = 'attributeName'; 1007 } 1008 } 1009 1010 private function beforeAttributeValueState() 1011 { 1012 // Consume the next input character: 1013 $this->char++; 1014 $char = $this->character($this->char); 1015 1016 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1017 /* U+0009 CHARACTER TABULATION 1018 U+000A LINE FEED (LF) 1019 U+000B LINE TABULATION 1020 U+000C FORM FEED (FF) 1021 U+0020 SPACE 1022 Stay in the before attribute value state. */ 1023 $this->state = 'beforeAttributeValue'; 1024 1025 } elseif ($char === '"') { 1026 /* U+0022 QUOTATION MARK (") 1027 Switch to the attribute value (double-quoted) state. */ 1028 $this->state = 'attributeValueDoubleQuoted'; 1029 1030 } elseif ($char === '&') { 1031 /* U+0026 AMPERSAND (&) 1032 Switch to the attribute value (unquoted) state and reconsume 1033 this input character. */ 1034 $this->char--; 1035 $this->state = 'attributeValueUnquoted'; 1036 1037 } elseif ($char === '\'') { 1038 /* U+0027 APOSTROPHE (') 1039 Switch to the attribute value (single-quoted) state. */ 1040 $this->state = 'attributeValueSingleQuoted'; 1041 1042 } elseif ($char === '>') { 1043 /* U+003E GREATER-THAN SIGN (>) 1044 Emit the current tag token. Switch to the data state. */ 1045 $this->emitToken($this->token); 1046 $this->state = 'data'; 1047 1048 } else { 1049 /* Anything else 1050 Append the current input character to the current attribute's value. 1051 Switch to the attribute value (unquoted) state. 
*/ 1052 $last = count($this->token['attr']) - 1; 1053 $this->token['attr'][$last]['value'] .= $char; 1054 1055 $this->state = 'attributeValueUnquoted'; 1056 } 1057 } 1058 1059 private function attributeValueDoubleQuotedState() 1060 { 1061 // Consume the next input character: 1062 $this->char++; 1063 $char = $this->character($this->char); 1064 1065 if ($char === '"') { 1066 /* U+0022 QUOTATION MARK (") 1067 Switch to the before attribute name state. */ 1068 $this->state = 'beforeAttributeName'; 1069 1070 } elseif ($char === '&') { 1071 /* U+0026 AMPERSAND (&) 1072 Switch to the entity in attribute value state. */ 1073 $this->entityInAttributeValueState('double'); 1074 1075 } elseif ($this->char === $this->EOF) { 1076 /* EOF 1077 Parse error. Emit the current tag token. Reconsume the character 1078 in the data state. */ 1079 $this->emitToken($this->token); 1080 1081 $this->char--; 1082 $this->state = 'data'; 1083 1084 } else { 1085 /* Anything else 1086 Append the current input character to the current attribute's value. 1087 Stay in the attribute value (double-quoted) state. */ 1088 $last = count($this->token['attr']) - 1; 1089 $this->token['attr'][$last]['value'] .= $char; 1090 1091 $this->state = 'attributeValueDoubleQuoted'; 1092 } 1093 } 1094 1095 private function attributeValueSingleQuotedState() 1096 { 1097 // Consume the next input character: 1098 $this->char++; 1099 $char = $this->character($this->char); 1100 1101 if ($char === '\'') { 1102 /* U+0022 QUOTATION MARK (') 1103 Switch to the before attribute name state. */ 1104 $this->state = 'beforeAttributeName'; 1105 1106 } elseif ($char === '&') { 1107 /* U+0026 AMPERSAND (&) 1108 Switch to the entity in attribute value state. */ 1109 $this->entityInAttributeValueState('single'); 1110 1111 } elseif ($this->char === $this->EOF) { 1112 /* EOF 1113 Parse error. Emit the current tag token. Reconsume the character 1114 in the data state. 
*/ 1115 $this->emitToken($this->token); 1116 1117 $this->char--; 1118 $this->state = 'data'; 1119 1120 } else { 1121 /* Anything else 1122 Append the current input character to the current attribute's value. 1123 Stay in the attribute value (single-quoted) state. */ 1124 $last = count($this->token['attr']) - 1; 1125 $this->token['attr'][$last]['value'] .= $char; 1126 1127 $this->state = 'attributeValueSingleQuoted'; 1128 } 1129 } 1130 1131 private function attributeValueUnquotedState() 1132 { 1133 // Consume the next input character: 1134 $this->char++; 1135 $char = $this->character($this->char); 1136 1137 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1138 /* U+0009 CHARACTER TABULATION 1139 U+000A LINE FEED (LF) 1140 U+000B LINE TABULATION 1141 U+000C FORM FEED (FF) 1142 U+0020 SPACE 1143 Switch to the before attribute name state. */ 1144 $this->state = 'beforeAttributeName'; 1145 1146 } elseif ($char === '&') { 1147 /* U+0026 AMPERSAND (&) 1148 Switch to the entity in attribute value state. */ 1149 $this->entityInAttributeValueState(); 1150 1151 } elseif ($char === '>') { 1152 /* U+003E GREATER-THAN SIGN (>) 1153 Emit the current tag token. Switch to the data state. */ 1154 $this->emitToken($this->token); 1155 $this->state = 'data'; 1156 1157 } else { 1158 /* Anything else 1159 Append the current input character to the current attribute's value. 1160 Stay in the attribute value (unquoted) state. */ 1161 $last = count($this->token['attr']) - 1; 1162 $this->token['attr'][$last]['value'] .= $char; 1163 1164 $this->state = 'attributeValueUnquoted'; 1165 } 1166 } 1167 1168 private function entityInAttributeValueState() 1169 { 1170 // Attempt to consume an entity. 1171 $entity = $this->entity(); 1172 1173 // If nothing is returned, append a U+0026 AMPERSAND character to the 1174 // current attribute's value. Otherwise, emit the character token that 1175 // was returned. 1176 $char = (!$entity) 1177 ? 
'&' 1178 : $entity; 1179 1180 $last = count($this->token['attr']) - 1; 1181 $this->token['attr'][$last]['value'] .= $char; 1182 } 1183 1184 private function bogusCommentState() 1185 { 1186 /* Consume every character up to the first U+003E GREATER-THAN SIGN 1187 character (>) or the end of the file (EOF), whichever comes first. Emit 1188 a comment token whose data is the concatenation of all the characters 1189 starting from and including the character that caused the state machine 1190 to switch into the bogus comment state, up to and including the last 1191 consumed character before the U+003E character, if any, or up to the 1192 end of the file otherwise. (If the comment was started by the end of 1193 the file (EOF), the token is empty.) */ 1194 $data = $this->characters('^>', $this->char); 1195 $this->emitToken( 1196 array( 1197 'data' => $data, 1198 'type' => self::COMMENT 1199 ) 1200 ); 1201 1202 $this->char += strlen($data); 1203 1204 /* Switch to the data state. */ 1205 $this->state = 'data'; 1206 1207 /* If the end of the file was reached, reconsume the EOF character. */ 1208 if ($this->char === $this->EOF) { 1209 $this->char = $this->EOF - 1; 1210 } 1211 } 1212 1213 private function markupDeclarationOpenState() 1214 { 1215 /* If the next two characters are both U+002D HYPHEN-MINUS (-) 1216 characters, consume those two characters, create a comment token whose 1217 data is the empty string, and switch to the comment state. */ 1218 if ($this->character($this->char + 1, 2) === '--') { 1219 $this->char += 2; 1220 $this->state = 'comment'; 1221 $this->token = array( 1222 'data' => null, 1223 'type' => self::COMMENT 1224 ); 1225 1226 /* Otherwise if the next seven chacacters are a case-insensitive match 1227 for the word "DOCTYPE", then consume those characters and switch to the 1228 DOCTYPE state. 
*/ 1229 } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') { 1230 $this->char += 7; 1231 $this->state = 'doctype'; 1232 1233 /* Otherwise, is is a parse error. Switch to the bogus comment state. 1234 The next character that is consumed, if any, is the first character 1235 that will be in the comment. */ 1236 } else { 1237 $this->char++; 1238 $this->state = 'bogusComment'; 1239 } 1240 } 1241 1242 private function commentState() 1243 { 1244 /* Consume the next input character: */ 1245 $this->char++; 1246 $char = $this->char(); 1247 1248 /* U+002D HYPHEN-MINUS (-) */ 1249 if ($char === '-') { 1250 /* Switch to the comment dash state */ 1251 $this->state = 'commentDash'; 1252 1253 /* EOF */ 1254 } elseif ($this->char === $this->EOF) { 1255 /* Parse error. Emit the comment token. Reconsume the EOF character 1256 in the data state. */ 1257 $this->emitToken($this->token); 1258 $this->char--; 1259 $this->state = 'data'; 1260 1261 /* Anything else */ 1262 } else { 1263 /* Append the input character to the comment token's data. Stay in 1264 the comment state. */ 1265 $this->token['data'] .= $char; 1266 } 1267 } 1268 1269 private function commentDashState() 1270 { 1271 /* Consume the next input character: */ 1272 $this->char++; 1273 $char = $this->char(); 1274 1275 /* U+002D HYPHEN-MINUS (-) */ 1276 if ($char === '-') { 1277 /* Switch to the comment end state */ 1278 $this->state = 'commentEnd'; 1279 1280 /* EOF */ 1281 } elseif ($this->char === $this->EOF) { 1282 /* Parse error. Emit the comment token. Reconsume the EOF character 1283 in the data state. */ 1284 $this->emitToken($this->token); 1285 $this->char--; 1286 $this->state = 'data'; 1287 1288 /* Anything else */ 1289 } else { 1290 /* Append a U+002D HYPHEN-MINUS (-) character and the input 1291 character to the comment token's data. Switch to the comment state. */ 1292 $this->token['data'] .= '-' . 
/**
 * Comment end state: "--" has been seen inside a comment.
 *
 * @return void
 */
private function commentEndState()
{
    $this->char++;
    $next = $this->char();

    // "-->" closes the comment.
    if ($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // A further hyphen: keep one "-" as comment text and stay here.
    if ($next === '-') {
        $this->token['data'] .= '-';
        return;
    }

    // EOF: emit the comment and let the data state reconsume the EOF.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The two hyphens were ordinary comment text after all.
    $this->token['data'] .= '--' . $next;
    $this->state = 'comment';
}

/**
 * DOCTYPE state (reached after "<!DOCTYPE").
 *
 * A single whitespace character is consumed; any other character is left
 * in place. Either way the tokenizer moves on to the "before DOCTYPE
 * name" state.
 *
 * @return void
 */
private function doctypeState()
{
    $this->char++;

    // Not whitespace: un-consume it so the next state sees it.
    if (!preg_match('/^[\t\n\x0b\x0c ]$/', $this->char())) {
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}

/**
 * Before DOCTYPE name state: skips whitespace and starts the DOCTYPE
 * token with the first character of its name.
 *
 * @return void
 */
private function beforeDoctypeNameState()
{
    $this->char++;
    $next = $this->char();

    // Whitespace: nothing to do, stay in this state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    // Lowercase ASCII letter: the name is stored upper-cased.
    if (preg_match('/^[a-z]$/', $next)) {
        $this->token = array(
            'name' => strtoupper($next),
            'type' => self::DOCTYPE,
            'error' => true
        );
        $this->state = 'doctypeName';
        return;
    }

    // ">" or EOF before any name: emit a nameless, erroneous DOCTYPE.
    $closed = ($next === '>');
    if ($closed || $this->char === $this->EOF) {
        $this->emitToken(
            array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            )
        );

        // Only the EOF needs to be reconsumed by the data state.
        if (!$closed) {
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // Any other character starts the name verbatim.
    $this->token = array(
        'name' => $next,
        'type' => self::DOCTYPE,
        'error' => true
    );
    $this->state = 'doctypeName';
}

/**
 * DOCTYPE name state: collects the DOCTYPE name.
 *
 * After every step the token's error flag is recomputed: the token is
 * only considered correct while its name is exactly "HTML".
 *
 * @return void
 */
private function doctypeNameState()
{
    $this->char++;
    $next = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        // NOTE(review): this state name is capitalised differently from
        // every other state string in view; presumably harmless if states
        // are dispatched as (case-insensitive) method names -- confirm.
        $this->state = 'AfterDoctypeName';

    } elseif ($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $next)) {
        // Lowercase letters are folded to upper case.
        $this->token['name'] .= strtoupper($next);

    } elseif ($this->char === $this->EOF) {
        // Emit what we have and reconsume the EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        $this->token['name'] .= $next;
    }

    $this->token['error'] = ($this->token['name'] !== 'HTML');
}

/**
 * After DOCTYPE name state: only whitespace, ">" or EOF may follow the
 * name; anything else turns the declaration into a bogus DOCTYPE.
 *
 * @return void
 */
private function afterDoctypeNameState()
{
    $this->char++;
    $next = $this->char();

    // Whitespace: stay in the after-DOCTYPE-name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        return;
    }

    if ($next === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Unexpected content after the name.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
/**
 * Bogus DOCTYPE state: swallows input until ">" or EOF, then emits the
 * (erroneous) DOCTYPE token and returns to the data state.
 *
 * @return void
 */
private function bogusDoctypeState()
{
    $this->char++;
    $closed = ($this->char() === '>');

    // Neither ">" nor EOF: stay in the bogus DOCTYPE state.
    if (!$closed && $this->char !== $this->EOF) {
        return;
    }

    $this->emitToken($this->token);

    // At EOF, step back so the data state reconsumes the EOF.
    if (!$closed) {
        $this->char--;
    }

    $this->state = 'data';
}

/**
 * Consumes a character reference starting at the current position
 * (expected to be the "&") and returns the decoded text.
 *
 * On success $this->char is left on the last consumed character of the
 * reference (the final name character or digit, or the terminating ";"
 * when present). On failure $this->char is restored to where it was.
 *
 * The numeric branch previously (a) looked for the "x"/"X" hex marker at
 * the offset of the "#" itself, so hexadecimal references were never
 * recognised, (b) built the entity text with character($start, $this->char),
 * i.e. with an absolute position where a length is expected, and (c) never
 * advanced past the digits. All three are fixed here.
 *
 * @return string|false Decoded text, or false if no reference matched.
 */
private function entity()
{
    // Position of the "&" that introduced the reference.
    $start = $this->char;
    $entity = null;

    if ($this->character($start + 1) === '#') {
        // Numeric reference. The hex marker, if any, is the character
        // AFTER the "#", i.e. two positions past the "&".
        $marker = $this->character($start + 2);
        $is_hex = ($marker === 'x' || $marker === 'X');

        $char_class = $is_hex ? '0-9A-Fa-f' : '0-9';
        // Normalised to lowercase "x" for html_entity_decode().
        $prefix = $is_hex ? '#x' : '#';

        // First position that may hold a digit.
        $digits_at = $start + 1 + strlen($prefix);
        $run = (string)$this->characters($char_class, $digits_at);

        // characters() is defined elsewhere; re-validate its result so
        // that only a genuine leading run of digits is accepted,
        // whatever it returns when nothing matches.
        if (preg_match('/^[' . $char_class . ']+/', $run, $match)) {
            $entity = $prefix . $match[0];

            // Leave the pointer on the last digit, mirroring the named
            // branch, which stops on the last character of the name.
            $this->char = $digits_at + strlen($match[0]) - 1;

            // Consume the terminating semicolon when there is one.
            if ($this->character($this->char + 1) === ';') {
                $this->char++;
            }
        }

    } else {
        // Named reference: find the shortest prefix of the candidate
        // text that is listed in $this->entities.
        $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
        $len = strlen($e_name);

        for ($c = 1; $c <= $len; $c++) {
            $id = substr($e_name, 0, $c);
            $this->char++;

            if (in_array($id, $this->entities)) {
                if ($e_name[$c - 1] !== ';') {
                    if ($c < $len && $e_name[$c] == ';') {
                        $this->char++; // consume extra semicolon
                    }
                }
                $entity = $id;
                break;
            }
        }
    }

    if ($entity === null) {
        // Parse error: nothing is consumed and nothing is returned.
        $this->char = $start;
        return false;
    }

    // Decode via a canonical "&name;" / "&#n;" / "&#xh;" form.
    return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
}
/**
 * Hands a token to the tree constructor and updates the tokenizer's
 * content model from the outcome.
 *
 * @param array $token Token to emit.
 * @return void
 */
private function emitToken($token)
{
    $result = $this->tree->emitToken($token);

    // An integer result is the content model the tree constructor asks for.
    if (is_int($result)) {
        $this->content_model = $result;
        return;
    }

    // Otherwise an end tag resets the content model to PCDATA.
    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}

/**
 * Signals end of input: clears the tokenizer state and passes an EOF
 * token directly to the tree constructor.
 *
 * @return void
 */
private function EOF()
{
    $this->state = null;

    $eof_token = array(
        'type' => self::EOF
    );
    $this->tree->emitToken($eof_token);
}
/**
 * Sets the tree constructor to its initial phase and insertion mode and
 * creates the DOMDocument the tree is built in.
 */
public function __construct()
{
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;

    $this->dom = new DOMDocument;
    $this->dom->encoding = 'UTF-8';
    $this->dom->preserveWhiteSpace = true;
    $this->dom->substituteEntities = true;
    $this->dom->strictErrorChecking = false;
}

/**
 * Entry point for tokens: routes the token to the handler of the current
 * tree-construction phase.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Whatever the phase handler returns.
 */
public function emitToken($token)
{
    switch ($this->phase) {
        case self::INIT_PHASE:
            return $this->initPhase($token);
        case self::ROOT_PHASE:
            return $this->rootElementPhase($token);
        case self::MAIN_PHASE:
            return $this->mainPhase($token);
        case self::END_PHASE:
            return $this->trailingEndPhase($token);
    }
}

/**
 * Initial phase: handles tokens seen before the root element exists.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of the root element phase when the token is
 *               forwarded there, otherwise null.
 */
private function initPhase($token)
{
    $marked_in_error = isset($token['error']) && $token['error'];

    // An erroneous DOCTYPE, a comment, a start/end tag, EOF, or character
    // data that is not pure whitespace: move on to the root element phase
    // and let it process this same token.
    if ($marked_in_error ||
        $token['type'] === HTML5::COMMENT ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
            !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))
    ) {
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    // A DOCTYPE marked as correct: just advance to the root element
    // phase. No DocumentType node is added to the document; the earlier
    // code instantiated a DOMDocumentType here but never attached or
    // used it, so that dead object has been dropped.
    if (isset($token['error']) && !$token['error']) {
        $this->phase = self::ROOT_PHASE;
        return;
    }

    // Pure whitespace: keep it as a text node directly under the document.
    if (isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->dom->appendChild($this->dom->createTextNode($token['data']));
    }
}
/**
 * Root element phase: handles tokens until the <html> element has been
 * created, then switches to the main phase.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of the main phase when the token is forwarded
 *               there, otherwise null.
 */
private function rootElementPhase($token)
{
    $type = $token['type'];

    // A DOCTYPE here is a parse error and is ignored.
    if ($type === HTML5::DOCTYPE) {
        return;
    }

    // Comments become children of the document itself.
    if ($type === HTML5::COMMENT) {
        $this->dom->appendChild($this->dom->createComment($token['data']));
        return;
    }

    // Pure whitespace also goes straight under the document.
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->dom->appendChild($this->dom->createTextNode($token['data']));
        return;
    }

    // Non-whitespace text, a start tag, an end tag or EOF all force the
    // root element into existence; anything else is left alone.
    $needs_root = ($type === HTML5::CHARACTR &&
            !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
        $type === HTML5::STARTTAG ||
        $type === HTML5::ENDTAG ||
        $type === HTML5::EOF;

    if (!$needs_root) {
        return;
    }

    // Create <html>, make it the first open element, and reprocess the
    // token in the main phase.
    $root = $this->dom->createElement('html');
    $this->dom->appendChild($root);
    $this->stack[] = $root;

    $this->phase = self::MAIN_PHASE;
    return $this->mainPhase($token);
}
/**
 * Main phase: handles the tokens that are treated the same in every
 * insertion mode and dispatches everything else to the handler of the
 * current insertion mode.
 *
 * The mode switch used to end with a "case self::END_PHASE" entry. That
 * constant is a phase, not an insertion mode, and its value (3) is the
 * same as self::IN_BODY, which is matched first, so the entry could never
 * run. It has been removed; the trailing end phase is reached through
 * emitToken()'s phase dispatch instead.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of the insertion-mode handler, otherwise null.
 */
private function mainPhase($token)
{
    // A DOCTYPE in the main phase is a parse error and is ignored.
    if ($token['type'] === HTML5::DOCTYPE) {
        return;
    }

    // A (repeated) <html> start tag only contributes attributes that the
    // root element, the first entry on the stack, does not have yet.
    if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        foreach ($token['attr'] as $attr) {
            if (!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }
        return;
    }

    // End of file: generate implied end tags.
    if ($token['type'] === HTML5::EOF) {
        $this->generateImpliedEndTags();
        return;
    }

    // Everything else depends on the insertion mode.
    switch ($this->mode) {
        case self::BEFOR_HEAD:
            return $this->beforeHead($token);
        case self::IN_HEAD:
            return $this->inHead($token);
        case self::AFTER_HEAD:
            return $this->afterHead($token);
        case self::IN_BODY:
            return $this->inBody($token);
        case self::IN_TABLE:
            return $this->inTable($token);
        case self::IN_CAPTION:
            return $this->inCaption($token);
        case self::IN_CGROUP:
            return $this->inColumnGroup($token);
        case self::IN_TBODY:
            return $this->inTableBody($token);
        case self::IN_ROW:
            return $this->inRow($token);
        case self::IN_CELL:
            return $this->inCell($token);
        case self::IN_SELECT:
            return $this->inSelect($token);
        case self::AFTER_BODY:
            return $this->afterBody($token);
        case self::IN_FRAME:
            return $this->inFrameset($token);
        case self::AFTR_FRAME:
            return $this->afterFrameset($token);
    }
}
/**
 * "Before head" insertion mode.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of the "in head" handler when the token is
 *               reprocessed there, otherwise null.
 */
private function beforeHead($token)
{
    $type = $token['type'];

    // Pure whitespace is appended to the current node.
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);
        return;
    }

    // Comments are appended to the current node.
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    // <head>: insert it, remember it as the head element, and switch to
    // the "in head" insertion mode.
    if ($type === HTML5::STARTTAG && $token['name'] === 'head') {
        $this->head_pointer = $this->insertElement($token);
        $this->mode = self::IN_HEAD;
        return;
    }

    // Any other start tag, </html>, or other character data implies a
    // <head>: act as if an attribute-less <head> start tag had been seen,
    // then reprocess the token in the "in head" mode.
    $implies_head = $type === HTML5::STARTTAG ||
        ($type === HTML5::ENDTAG && $token['name'] === 'html') ||
        ($type === HTML5::CHARACTR && !preg_match(
            '/^[\t\n\x0b\x0c ]$/',
            $token['data']
        ));

    if ($implies_head) {
        $this->beforeHead(
            array(
                'name' => 'head',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inHead($token);
    }

    // Any other end tag is a parse error and is ignored.
}
/**
 * "In head" insertion mode.
 *
 * Differs from the specification in one respect that the original code
 * already documented: while the current node is a title, style or script
 * element, every character token is appended to it regardless of its
 * content.
 *
 * The head element pointer may be null (the original code calls this the
 * innerHTML case). The title, style and base/link/meta branches already
 * allowed for that; the script branch and the two isSameNode() checks did
 * not and would have called a method on null. They now follow the same
 * convention.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed A content-model constant (HTML5::PCDATA, HTML5::RCDATA or
 *               HTML5::CDATA) when the tokenizer has to switch, the result
 *               of the "after head" handler when the token is reprocessed
 *               there, otherwise null.
 */
private function inHead($token)
{
    $type = $token['type'];

    // Whitespace, or any text inside title/style/script: append it to the
    // current node.
    if ($type === HTML5::CHARACTR && (
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) ||
            in_array(end($this->stack)->nodeName, array('title', 'style', 'script'))
        )
    ) {
        $this->insertText($token['data']);
        return;
    }

    // Comments are appended to the current node.
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    // </title>, </style>, </script>: close the element and put the
    // tokenizer back into PCDATA.
    if ($type === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'))
    ) {
        array_pop($this->stack);
        return HTML5::PCDATA;
    }

    // <title>, <style>, <script>: attach the new element to the head
    // element if there is one, otherwise to the current node, and switch
    // the tokenizer's content model (RCDATA for title, CDATA otherwise).
    if ($type === HTML5::STARTTAG &&
        in_array($token['name'], array('title', 'style', 'script'), true)
    ) {
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
        } else {
            $this->insertElement($token);
        }

        return ($token['name'] === 'title') ? HTML5::RCDATA : HTML5::CDATA;
    }

    // <base>, <link>, <meta>: same attachment rule. When attached to the
    // head element, the new element is taken off the stack again at once.
    if ($type === HTML5::STARTTAG &&
        in_array($token['name'], array('base', 'link', 'meta'))
    ) {
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
            array_pop($this->stack);
        } else {
            $this->insertElement($token);
        }
        return;
    }

    $head_is_current = $this->head_pointer !== null &&
        $this->head_pointer->isSameNode(end($this->stack));

    // </head>: pop the head element if it is the current node (otherwise
    // this is a parse error), then move to "after head".
    if ($type === HTML5::ENDTAG && $token['name'] === 'head') {
        if ($head_is_current) {
            array_pop($this->stack);
        }

        $this->mode = self::AFTER_HEAD;
        return;
    }

    // A second <head>, or any end tag other than </html>: parse error,
    // ignore the token.
    if (($type === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($type === HTML5::ENDTAG && $token['name'] !== 'html')
    ) {
        return;
    }

    // Anything else: leave the head (acting as if </head> had been seen
    // when the head element is the current node), then reprocess the
    // token in the "after head" mode.
    if ($head_is_current) {
        $this->inHead(
            array(
                'name' => 'head',
                'type' => HTML5::ENDTAG
            )
        );
    } else {
        $this->mode = self::AFTER_HEAD;
    }

    return $this->afterHead($token);
}
/**
 * "After head" insertion mode.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Result of the "in head" or "in body" handler when the
 *               token is reprocessed there, otherwise null.
 */
private function afterHead($token)
{
    $type = $token['type'];
    $is_start = ($type === HTML5::STARTTAG);

    // Pure whitespace is appended to the current node.
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $this->insertText($token['data']);
        return;
    }

    // Comments are appended to the current node.
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    // <body>: insert it and switch to "in body".
    if ($is_start && $token['name'] === 'body') {
        $this->insertElement($token);
        $this->mode = self::IN_BODY;
        return;
    }

    // <frameset>: insert it and switch to "in frameset".
    if ($is_start && $token['name'] === 'frameset') {
        $this->insertElement($token);
        $this->mode = self::IN_FRAME;
        return;
    }

    // Head-only elements after the head: parse error. Go back to
    // "in head" and reprocess the token there.
    if ($is_start && in_array(
            $token['name'],
            array('base', 'link', 'meta', 'script', 'style', 'title')
        )
    ) {
        $this->mode = self::IN_HEAD;
        return $this->inHead($token);
    }

    // Anything else implies a <body>: act as if an attribute-less <body>
    // start tag had been seen, then reprocess the token in "in body".
    $this->afterHead(
        array(
            'name' => 'body',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        )
    );

    return $this->inBody($token);
}
*/ 2157 $this->mode = self::IN_HEAD; 2158 return $this->inHead($token); 2159 2160 /* Anything else */ 2161 } else { 2162 /* Act as if a start tag token with the tag name "body" and no 2163 attributes had been seen, and then reprocess the current token. */ 2164 $this->afterHead( 2165 array( 2166 'name' => 'body', 2167 'type' => HTML5::STARTTAG, 2168 'attr' => array() 2169 ) 2170 ); 2171 2172 return $this->inBody($token); 2173 } 2174 } 2175 2176 private function inBody($token) 2177 { 2178 /* Handle the token as follows: */ 2179 2180 switch ($token['type']) { 2181 /* A character token */ 2182 case HTML5::CHARACTR: 2183 /* Reconstruct the active formatting elements, if any. */ 2184 $this->reconstructActiveFormattingElements(); 2185 2186 /* Append the token's character to the current node. */ 2187 $this->insertText($token['data']); 2188 break; 2189 2190 /* A comment token */ 2191 case HTML5::COMMENT: 2192 /* Append a Comment node to the current node with the data 2193 attribute set to the data given in the comment token. */ 2194 $this->insertComment($token['data']); 2195 break; 2196 2197 case HTML5::STARTTAG: 2198 switch ($token['name']) { 2199 /* A start tag token whose tag name is one of: "script", 2200 "style" */ 2201 case 'script': 2202 case 'style': 2203 /* Process the token as if the insertion mode had been "in 2204 head". */ 2205 return $this->inHead($token); 2206 break; 2207 2208 /* A start tag token whose tag name is one of: "base", "link", 2209 "meta", "title" */ 2210 case 'base': 2211 case 'link': 2212 case 'meta': 2213 case 'title': 2214 /* Parse error. Process the token as if the insertion mode 2215 had been "in head". */ 2216 return $this->inHead($token); 2217 break; 2218 2219 /* A start tag token with the tag name "body" */ 2220 case 'body': 2221 /* Parse error. If the second element on the stack of open 2222 elements is not a body element, or, if the stack of open 2223 elements has only one node on it, then ignore the token. 
2224 (innerHTML case) */ 2225 if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') { 2226 // Ignore 2227 2228 /* Otherwise, for each attribute on the token, check to see 2229 if the attribute is already present on the body element (the 2230 second element) on the stack of open elements. If it is not, 2231 add the attribute and its corresponding value to that 2232 element. */ 2233 } else { 2234 foreach ($token['attr'] as $attr) { 2235 if (!$this->stack[1]->hasAttribute($attr['name'])) { 2236 $this->stack[1]->setAttribute($attr['name'], $attr['value']); 2237 } 2238 } 2239 } 2240 break; 2241 2242 /* A start tag whose tag name is one of: "address", 2243 "blockquote", "center", "dir", "div", "dl", "fieldset", 2244 "listing", "menu", "ol", "p", "ul" */ 2245 case 'address': 2246 case 'blockquote': 2247 case 'center': 2248 case 'dir': 2249 case 'div': 2250 case 'dl': 2251 case 'fieldset': 2252 case 'listing': 2253 case 'menu': 2254 case 'ol': 2255 case 'p': 2256 case 'ul': 2257 /* If the stack of open elements has a p element in scope, 2258 then act as if an end tag with the tag name p had been 2259 seen. */ 2260 if ($this->elementInScope('p')) { 2261 $this->emitToken( 2262 array( 2263 'name' => 'p', 2264 'type' => HTML5::ENDTAG 2265 ) 2266 ); 2267 } 2268 2269 /* Insert an HTML element for the token. */ 2270 $this->insertElement($token); 2271 break; 2272 2273 /* A start tag whose tag name is "form" */ 2274 case 'form': 2275 /* If the form element pointer is not null, ignore the 2276 token with a parse error. */ 2277 if ($this->form_pointer !== null) { 2278 // Ignore. 2279 2280 /* Otherwise: */ 2281 } else { 2282 /* If the stack of open elements has a p element in 2283 scope, then act as if an end tag with the tag name p 2284 had been seen. 
*/ 2285 if ($this->elementInScope('p')) { 2286 $this->emitToken( 2287 array( 2288 'name' => 'p', 2289 'type' => HTML5::ENDTAG 2290 ) 2291 ); 2292 } 2293 2294 /* Insert an HTML element for the token, and set the 2295 form element pointer to point to the element created. */ 2296 $element = $this->insertElement($token); 2297 $this->form_pointer = $element; 2298 } 2299 break; 2300 2301 /* A start tag whose tag name is "li", "dd" or "dt" */ 2302 case 'li': 2303 case 'dd': 2304 case 'dt': 2305 /* If the stack of open elements has a p element in scope, 2306 then act as if an end tag with the tag name p had been 2307 seen. */ 2308 if ($this->elementInScope('p')) { 2309 $this->emitToken( 2310 array( 2311 'name' => 'p', 2312 'type' => HTML5::ENDTAG 2313 ) 2314 ); 2315 } 2316 2317 $stack_length = count($this->stack) - 1; 2318 2319 for ($n = $stack_length; 0 <= $n; $n--) { 2320 /* 1. Initialise node to be the current node (the 2321 bottommost node of the stack). */ 2322 $stop = false; 2323 $node = $this->stack[$n]; 2324 $cat = $this->getElementCategory($node->tagName); 2325 2326 /* 2. If node is an li, dd or dt element, then pop all 2327 the nodes from the current node up to node, including 2328 node, then stop this algorithm. */ 2329 if ($token['name'] === $node->tagName || ($token['name'] !== 'li' 2330 && ($node->tagName === 'dd' || $node->tagName === 'dt')) 2331 ) { 2332 for ($x = $stack_length; $x >= $n; $x--) { 2333 array_pop($this->stack); 2334 } 2335 2336 break; 2337 } 2338 2339 /* 3. If node is not in the formatting category, and is 2340 not in the phrasing category, and is not an address or 2341 div element, then stop this algorithm. */ 2342 if ($cat !== self::FORMATTING && $cat !== self::PHRASING && 2343 $node->tagName !== 'address' && $node->tagName !== 'div' 2344 ) { 2345 break; 2346 } 2347 } 2348 2349 /* Finally, insert an HTML element with the same tag 2350 name as the token's. 
*/ 2351 $this->insertElement($token); 2352 break; 2353 2354 /* A start tag token whose tag name is "plaintext" */ 2355 case 'plaintext': 2356 /* If the stack of open elements has a p element in scope, 2357 then act as if an end tag with the tag name p had been 2358 seen. */ 2359 if ($this->elementInScope('p')) { 2360 $this->emitToken( 2361 array( 2362 'name' => 'p', 2363 'type' => HTML5::ENDTAG 2364 ) 2365 ); 2366 } 2367 2368 /* Insert an HTML element for the token. */ 2369 $this->insertElement($token); 2370 2371 return HTML5::PLAINTEXT; 2372 break; 2373 2374 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", 2375 "h5", "h6" */ 2376 case 'h1': 2377 case 'h2': 2378 case 'h3': 2379 case 'h4': 2380 case 'h5': 2381 case 'h6': 2382 /* If the stack of open elements has a p element in scope, 2383 then act as if an end tag with the tag name p had been seen. */ 2384 if ($this->elementInScope('p')) { 2385 $this->emitToken( 2386 array( 2387 'name' => 'p', 2388 'type' => HTML5::ENDTAG 2389 ) 2390 ); 2391 } 2392 2393 /* If the stack of open elements has in scope an element whose 2394 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 2395 this is a parse error; pop elements from the stack until an 2396 element with one of those tag names has been popped from the 2397 stack. */ 2398 while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) { 2399 array_pop($this->stack); 2400 } 2401 2402 /* Insert an HTML element for the token. 
*/ 2403 $this->insertElement($token); 2404 break; 2405 2406 /* A start tag whose tag name is "a" */ 2407 case 'a': 2408 /* If the list of active formatting elements contains 2409 an element whose tag name is "a" between the end of the 2410 list and the last marker on the list (or the start of 2411 the list if there is no marker on the list), then this 2412 is a parse error; act as if an end tag with the tag name 2413 "a" had been seen, then remove that element from the list 2414 of active formatting elements and the stack of open 2415 elements if the end tag didn't already remove it (it 2416 might not have if the element is not in table scope). */ 2417 $leng = count($this->a_formatting); 2418 2419 for ($n = $leng - 1; $n >= 0; $n--) { 2420 if ($this->a_formatting[$n] === self::MARKER) { 2421 break; 2422 2423 } elseif ($this->a_formatting[$n]->nodeName === 'a') { 2424 $this->emitToken( 2425 array( 2426 'name' => 'a', 2427 'type' => HTML5::ENDTAG 2428 ) 2429 ); 2430 break; 2431 } 2432 } 2433 2434 /* Reconstruct the active formatting elements, if any. */ 2435 $this->reconstructActiveFormattingElements(); 2436 2437 /* Insert an HTML element for the token. */ 2438 $el = $this->insertElement($token); 2439 2440 /* Add that element to the list of active formatting 2441 elements. */ 2442 $this->a_formatting[] = $el; 2443 break; 2444 2445 /* A start tag whose tag name is one of: "b", "big", "em", "font", 2446 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 2447 case 'b': 2448 case 'big': 2449 case 'em': 2450 case 'font': 2451 case 'i': 2452 case 'nobr': 2453 case 's': 2454 case 'small': 2455 case 'strike': 2456 case 'strong': 2457 case 'tt': 2458 case 'u': 2459 /* Reconstruct the active formatting elements, if any. */ 2460 $this->reconstructActiveFormattingElements(); 2461 2462 /* Insert an HTML element for the token. */ 2463 $el = $this->insertElement($token); 2464 2465 /* Add that element to the list of active formatting 2466 elements. 
*/ 2467 $this->a_formatting[] = $el; 2468 break; 2469 2470 /* A start tag token whose tag name is "button" */ 2471 case 'button': 2472 /* If the stack of open elements has a button element in scope, 2473 then this is a parse error; act as if an end tag with the tag 2474 name "button" had been seen, then reprocess the token. (We don't 2475 do that. Unnecessary.) */ 2476 if ($this->elementInScope('button')) { 2477 $this->inBody( 2478 array( 2479 'name' => 'button', 2480 'type' => HTML5::ENDTAG 2481 ) 2482 ); 2483 } 2484 2485 /* Reconstruct the active formatting elements, if any. */ 2486 $this->reconstructActiveFormattingElements(); 2487 2488 /* Insert an HTML element for the token. */ 2489 $this->insertElement($token); 2490 2491 /* Insert a marker at the end of the list of active 2492 formatting elements. */ 2493 $this->a_formatting[] = self::MARKER; 2494 break; 2495 2496 /* A start tag token whose tag name is one of: "marquee", "object" */ 2497 case 'marquee': 2498 case 'object': 2499 /* Reconstruct the active formatting elements, if any. */ 2500 $this->reconstructActiveFormattingElements(); 2501 2502 /* Insert an HTML element for the token. */ 2503 $this->insertElement($token); 2504 2505 /* Insert a marker at the end of the list of active 2506 formatting elements. */ 2507 $this->a_formatting[] = self::MARKER; 2508 break; 2509 2510 /* A start tag token whose tag name is "xmp" */ 2511 case 'xmp': 2512 /* Reconstruct the active formatting elements, if any. */ 2513 $this->reconstructActiveFormattingElements(); 2514 2515 /* Insert an HTML element for the token. */ 2516 $this->insertElement($token); 2517 2518 /* Switch the content model flag to the CDATA state. */ 2519 return HTML5::CDATA; 2520 break; 2521 2522 /* A start tag whose tag name is "table" */ 2523 case 'table': 2524 /* If the stack of open elements has a p element in scope, 2525 then act as if an end tag with the tag name p had been seen. 
*/ 2526 if ($this->elementInScope('p')) { 2527 $this->emitToken( 2528 array( 2529 'name' => 'p', 2530 'type' => HTML5::ENDTAG 2531 ) 2532 ); 2533 } 2534 2535 /* Insert an HTML element for the token. */ 2536 $this->insertElement($token); 2537 2538 /* Change the insertion mode to "in table". */ 2539 $this->mode = self::IN_TABLE; 2540 break; 2541 2542 /* A start tag whose tag name is one of: "area", "basefont", 2543 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ 2544 case 'area': 2545 case 'basefont': 2546 case 'bgsound': 2547 case 'br': 2548 case 'embed': 2549 case 'img': 2550 case 'param': 2551 case 'spacer': 2552 case 'wbr': 2553 /* Reconstruct the active formatting elements, if any. */ 2554 $this->reconstructActiveFormattingElements(); 2555 2556 /* Insert an HTML element for the token. */ 2557 $this->insertElement($token); 2558 2559 /* Immediately pop the current node off the stack of open elements. */ 2560 array_pop($this->stack); 2561 break; 2562 2563 /* A start tag whose tag name is "hr" */ 2564 case 'hr': 2565 /* If the stack of open elements has a p element in scope, 2566 then act as if an end tag with the tag name p had been seen. */ 2567 if ($this->elementInScope('p')) { 2568 $this->emitToken( 2569 array( 2570 'name' => 'p', 2571 'type' => HTML5::ENDTAG 2572 ) 2573 ); 2574 } 2575 2576 /* Insert an HTML element for the token. */ 2577 $this->insertElement($token); 2578 2579 /* Immediately pop the current node off the stack of open elements. */ 2580 array_pop($this->stack); 2581 break; 2582 2583 /* A start tag whose tag name is "image" */ 2584 case 'image': 2585 /* Parse error. Change the token's tag name to "img" and 2586 reprocess it. (Don't ask.) */ 2587 $token['name'] = 'img'; 2588 return $this->inBody($token); 2589 break; 2590 2591 /* A start tag whose tag name is "input" */ 2592 case 'input': 2593 /* Reconstruct the active formatting elements, if any. 
*/ 2594 $this->reconstructActiveFormattingElements(); 2595 2596 /* Insert an input element for the token. */ 2597 $element = $this->insertElement($token, false); 2598 2599 /* If the form element pointer is not null, then associate the 2600 input element with the form element pointed to by the form 2601 element pointer. */ 2602 $this->form_pointer !== null 2603 ? $this->form_pointer->appendChild($element) 2604 : end($this->stack)->appendChild($element); 2605 2606 /* Pop that input element off the stack of open elements. */ 2607 array_pop($this->stack); 2608 break; 2609 2610 /* A start tag whose tag name is "isindex" */ 2611 case 'isindex': 2612 /* Parse error. */ 2613 // w/e 2614 2615 /* If the form element pointer is not null, 2616 then ignore the token. */ 2617 if ($this->form_pointer === null) { 2618 /* Act as if a start tag token with the tag name "form" had 2619 been seen. */ 2620 $this->inBody( 2621 array( 2622 'name' => 'body', 2623 'type' => HTML5::STARTTAG, 2624 'attr' => array() 2625 ) 2626 ); 2627 2628 /* Act as if a start tag token with the tag name "hr" had 2629 been seen. */ 2630 $this->inBody( 2631 array( 2632 'name' => 'hr', 2633 'type' => HTML5::STARTTAG, 2634 'attr' => array() 2635 ) 2636 ); 2637 2638 /* Act as if a start tag token with the tag name "p" had 2639 been seen. */ 2640 $this->inBody( 2641 array( 2642 'name' => 'p', 2643 'type' => HTML5::STARTTAG, 2644 'attr' => array() 2645 ) 2646 ); 2647 2648 /* Act as if a start tag token with the tag name "label" 2649 had been seen. */ 2650 $this->inBody( 2651 array( 2652 'name' => 'label', 2653 'type' => HTML5::STARTTAG, 2654 'attr' => array() 2655 ) 2656 ); 2657 2658 /* Act as if a stream of character tokens had been seen. */ 2659 $this->insertText( 2660 'This is a searchable index. ' . 
2661 'Insert your search keywords here: ' 2662 ); 2663 2664 /* Act as if a start tag token with the tag name "input" 2665 had been seen, with all the attributes from the "isindex" 2666 token, except with the "name" attribute set to the value 2667 "isindex" (ignoring any explicit "name" attribute). */ 2668 $attr = $token['attr']; 2669 $attr[] = array('name' => 'name', 'value' => 'isindex'); 2670 2671 $this->inBody( 2672 array( 2673 'name' => 'input', 2674 'type' => HTML5::STARTTAG, 2675 'attr' => $attr 2676 ) 2677 ); 2678 2679 /* Act as if a stream of character tokens had been seen 2680 (see below for what they should say). */ 2681 $this->insertText( 2682 'This is a searchable index. ' . 2683 'Insert your search keywords here: ' 2684 ); 2685 2686 /* Act as if an end tag token with the tag name "label" 2687 had been seen. */ 2688 $this->inBody( 2689 array( 2690 'name' => 'label', 2691 'type' => HTML5::ENDTAG 2692 ) 2693 ); 2694 2695 /* Act as if an end tag token with the tag name "p" had 2696 been seen. */ 2697 $this->inBody( 2698 array( 2699 'name' => 'p', 2700 'type' => HTML5::ENDTAG 2701 ) 2702 ); 2703 2704 /* Act as if a start tag token with the tag name "hr" had 2705 been seen. */ 2706 $this->inBody( 2707 array( 2708 'name' => 'hr', 2709 'type' => HTML5::ENDTAG 2710 ) 2711 ); 2712 2713 /* Act as if an end tag token with the tag name "form" had 2714 been seen. */ 2715 $this->inBody( 2716 array( 2717 'name' => 'form', 2718 'type' => HTML5::ENDTAG 2719 ) 2720 ); 2721 } 2722 break; 2723 2724 /* A start tag whose tag name is "textarea" */ 2725 case 'textarea': 2726 $this->insertElement($token); 2727 2728 /* Switch the tokeniser's content model flag to the 2729 RCDATA state. 
*/ 2730 return HTML5::RCDATA; 2731 break; 2732 2733 /* A start tag whose tag name is one of: "iframe", "noembed", 2734 "noframes" */ 2735 case 'iframe': 2736 case 'noembed': 2737 case 'noframes': 2738 $this->insertElement($token); 2739 2740 /* Switch the tokeniser's content model flag to the CDATA state. */ 2741 return HTML5::CDATA; 2742 break; 2743 2744 /* A start tag whose tag name is "select" */ 2745 case 'select': 2746 /* Reconstruct the active formatting elements, if any. */ 2747 $this->reconstructActiveFormattingElements(); 2748 2749 /* Insert an HTML element for the token. */ 2750 $this->insertElement($token); 2751 2752 /* Change the insertion mode to "in select". */ 2753 $this->mode = self::IN_SELECT; 2754 break; 2755 2756 /* A start or end tag whose tag name is one of: "caption", "col", 2757 "colgroup", "frame", "frameset", "head", "option", "optgroup", 2758 "tbody", "td", "tfoot", "th", "thead", "tr". */ 2759 case 'caption': 2760 case 'col': 2761 case 'colgroup': 2762 case 'frame': 2763 case 'frameset': 2764 case 'head': 2765 case 'option': 2766 case 'optgroup': 2767 case 'tbody': 2768 case 'td': 2769 case 'tfoot': 2770 case 'th': 2771 case 'thead': 2772 case 'tr': 2773 // Parse error. Ignore the token. 2774 break; 2775 2776 /* A start or end tag whose tag name is one of: "event-source", 2777 "section", "nav", "article", "aside", "header", "footer", 2778 "datagrid", "command" */ 2779 case 'event-source': 2780 case 'section': 2781 case 'nav': 2782 case 'article': 2783 case 'aside': 2784 case 'header': 2785 case 'footer': 2786 case 'datagrid': 2787 case 'command': 2788 // Work in progress! 2789 break; 2790 2791 /* A start tag token not covered by the previous entries */ 2792 default: 2793 /* Reconstruct the active formatting elements, if any. 
*/ 2794 $this->reconstructActiveFormattingElements(); 2795 2796 $this->insertElement($token, true, true); 2797 break; 2798 } 2799 break; 2800 2801 case HTML5::ENDTAG: 2802 switch ($token['name']) { 2803 /* An end tag with the tag name "body" */ 2804 case 'body': 2805 /* If the second element in the stack of open elements is 2806 not a body element, this is a parse error. Ignore the token. 2807 (innerHTML case) */ 2808 if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') { 2809 // Ignore. 2810 2811 /* If the current node is not the body element, then this 2812 is a parse error. */ 2813 } elseif (end($this->stack)->nodeName !== 'body') { 2814 // Parse error. 2815 } 2816 2817 /* Change the insertion mode to "after body". */ 2818 $this->mode = self::AFTER_BODY; 2819 break; 2820 2821 /* An end tag with the tag name "html" */ 2822 case 'html': 2823 /* Act as if an end tag with tag name "body" had been seen, 2824 then, if that token wasn't ignored, reprocess the current 2825 token. */ 2826 $this->inBody( 2827 array( 2828 'name' => 'body', 2829 'type' => HTML5::ENDTAG 2830 ) 2831 ); 2832 2833 return $this->afterBody($token); 2834 break; 2835 2836 /* An end tag whose tag name is one of: "address", "blockquote", 2837 "center", "dir", "div", "dl", "fieldset", "listing", "menu", 2838 "ol", "pre", "ul" */ 2839 case 'address': 2840 case 'blockquote': 2841 case 'center': 2842 case 'dir': 2843 case 'div': 2844 case 'dl': 2845 case 'fieldset': 2846 case 'listing': 2847 case 'menu': 2848 case 'ol': 2849 case 'pre': 2850 case 'ul': 2851 /* If the stack of open elements has an element in scope 2852 with the same tag name as that of the token, then generate 2853 implied end tags. */ 2854 if ($this->elementInScope($token['name'])) { 2855 $this->generateImpliedEndTags(); 2856 2857 /* Now, if the current node is not an element with 2858 the same tag name as that of the token, then this 2859 is a parse error. 
*/ 2860 // w/e 2861 2862 /* If the stack of open elements has an element in 2863 scope with the same tag name as that of the token, 2864 then pop elements from this stack until an element 2865 with that tag name has been popped from the stack. */ 2866 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 2867 if ($this->stack[$n]->nodeName === $token['name']) { 2868 $n = -1; 2869 } 2870 2871 array_pop($this->stack); 2872 } 2873 } 2874 break; 2875 2876 /* An end tag whose tag name is "form" */ 2877 case 'form': 2878 /* If the stack of open elements has an element in scope 2879 with the same tag name as that of the token, then generate 2880 implied end tags. */ 2881 if ($this->elementInScope($token['name'])) { 2882 $this->generateImpliedEndTags(); 2883 2884 } 2885 2886 if (end($this->stack)->nodeName !== $token['name']) { 2887 /* Now, if the current node is not an element with the 2888 same tag name as that of the token, then this is a parse 2889 error. */ 2890 // w/e 2891 2892 } else { 2893 /* Otherwise, if the current node is an element with 2894 the same tag name as that of the token pop that element 2895 from the stack. */ 2896 array_pop($this->stack); 2897 } 2898 2899 /* In any case, set the form element pointer to null. */ 2900 $this->form_pointer = null; 2901 break; 2902 2903 /* An end tag whose tag name is "p" */ 2904 case 'p': 2905 /* If the stack of open elements has a p element in scope, 2906 then generate implied end tags, except for p elements. */ 2907 if ($this->elementInScope('p')) { 2908 $this->generateImpliedEndTags(array('p')); 2909 2910 /* If the current node is not a p element, then this is 2911 a parse error. */ 2912 // k 2913 2914 /* If the stack of open elements has a p element in 2915 scope, then pop elements from this stack until the stack 2916 no longer has a p element in scope. 
*/ 2917 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 2918 if ($this->elementInScope('p')) { 2919 array_pop($this->stack); 2920 2921 } else { 2922 break; 2923 } 2924 } 2925 } 2926 break; 2927 2928 /* An end tag whose tag name is "dd", "dt", or "li" */ 2929 case 'dd': 2930 case 'dt': 2931 case 'li': 2932 /* If the stack of open elements has an element in scope 2933 whose tag name matches the tag name of the token, then 2934 generate implied end tags, except for elements with the 2935 same tag name as the token. */ 2936 if ($this->elementInScope($token['name'])) { 2937 $this->generateImpliedEndTags(array($token['name'])); 2938 2939 /* If the current node is not an element with the same 2940 tag name as the token, then this is a parse error. */ 2941 // w/e 2942 2943 /* If the stack of open elements has an element in scope 2944 whose tag name matches the tag name of the token, then 2945 pop elements from this stack until an element with that 2946 tag name has been popped from the stack. */ 2947 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 2948 if ($this->stack[$n]->nodeName === $token['name']) { 2949 $n = -1; 2950 } 2951 2952 array_pop($this->stack); 2953 } 2954 } 2955 break; 2956 2957 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", 2958 "h5", "h6" */ 2959 case 'h1': 2960 case 'h2': 2961 case 'h3': 2962 case 'h4': 2963 case 'h5': 2964 case 'h6': 2965 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); 2966 2967 /* If the stack of open elements has in scope an element whose 2968 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 2969 generate implied end tags. */ 2970 if ($this->elementInScope($elements)) { 2971 $this->generateImpliedEndTags(); 2972 2973 /* Now, if the current node is not an element with the same 2974 tag name as that of the token, then this is a parse error. 
*/ 2975 // w/e 2976 2977 /* If the stack of open elements has in scope an element 2978 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or 2979 "h6", then pop elements from the stack until an element 2980 with one of those tag names has been popped from the stack. */ 2981 while ($this->elementInScope($elements)) { 2982 array_pop($this->stack); 2983 } 2984 } 2985 break; 2986 2987 /* An end tag whose tag name is one of: "a", "b", "big", "em", 2988 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 2989 case 'a': 2990 case 'b': 2991 case 'big': 2992 case 'em': 2993 case 'font': 2994 case 'i': 2995 case 'nobr': 2996 case 's': 2997 case 'small': 2998 case 'strike': 2999 case 'strong': 3000 case 'tt': 3001 case 'u': 3002 /* 1. Let the formatting element be the last element in 3003 the list of active formatting elements that: 3004 * is between the end of the list and the last scope 3005 marker in the list, if any, or the start of the list 3006 otherwise, and 3007 * has the same tag name as the token. 3008 */ 3009 while (true) { 3010 for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) { 3011 if ($this->a_formatting[$a] === self::MARKER) { 3012 break; 3013 3014 } elseif ($this->a_formatting[$a]->tagName === $token['name']) { 3015 $formatting_element = $this->a_formatting[$a]; 3016 $in_stack = in_array($formatting_element, $this->stack, true); 3017 $fe_af_pos = $a; 3018 break; 3019 } 3020 } 3021 3022 /* If there is no such node, or, if that node is 3023 also in the stack of open elements but the element 3024 is not in scope, then this is a parse error. Abort 3025 these steps. The token is ignored. */ 3026 if (!isset($formatting_element) || ($in_stack && 3027 !$this->elementInScope($token['name'])) 3028 ) { 3029 break; 3030 3031 /* Otherwise, if there is such a node, but that node 3032 is not in the stack of open elements, then this is a 3033 parse error; remove the element from the list, and 3034 abort these steps. 
*/ 3035 } elseif (isset($formatting_element) && !$in_stack) { 3036 unset($this->a_formatting[$fe_af_pos]); 3037 $this->a_formatting = array_merge($this->a_formatting); 3038 break; 3039 } 3040 3041 /* 2. Let the furthest block be the topmost node in the 3042 stack of open elements that is lower in the stack 3043 than the formatting element, and is not an element in 3044 the phrasing or formatting categories. There might 3045 not be one. */ 3046 $fe_s_pos = array_search($formatting_element, $this->stack, true); 3047 $length = count($this->stack); 3048 3049 for ($s = $fe_s_pos + 1; $s < $length; $s++) { 3050 $category = $this->getElementCategory($this->stack[$s]->nodeName); 3051 3052 if ($category !== self::PHRASING && $category !== self::FORMATTING) { 3053 $furthest_block = $this->stack[$s]; 3054 } 3055 } 3056 3057 /* 3. If there is no furthest block, then the UA must 3058 skip the subsequent steps and instead just pop all 3059 the nodes from the bottom of the stack of open 3060 elements, from the current node up to the formatting 3061 element, and remove the formatting element from the 3062 list of active formatting elements. */ 3063 if (!isset($furthest_block)) { 3064 for ($n = $length - 1; $n >= $fe_s_pos; $n--) { 3065 array_pop($this->stack); 3066 } 3067 3068 unset($this->a_formatting[$fe_af_pos]); 3069 $this->a_formatting = array_merge($this->a_formatting); 3070 break; 3071 } 3072 3073 /* 4. Let the common ancestor be the element 3074 immediately above the formatting element in the stack 3075 of open elements. */ 3076 $common_ancestor = $this->stack[$fe_s_pos - 1]; 3077 3078 /* 5. If the furthest block has a parent node, then 3079 remove the furthest block from its parent node. */ 3080 if ($furthest_block->parentNode !== null) { 3081 $furthest_block->parentNode->removeChild($furthest_block); 3082 } 3083 3084 /* 6. 
Let a bookmark note the position of the 3085 formatting element in the list of active formatting 3086 elements relative to the elements on either side 3087 of it in the list. */ 3088 $bookmark = $fe_af_pos; 3089 3090 /* 7. Let node and last node be the furthest block. 3091 Follow these steps: */ 3092 $node = $furthest_block; 3093 $last_node = $furthest_block; 3094 3095 while (true) { 3096 for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) { 3097 /* 7.1 Let node be the element immediately 3098 prior to node in the stack of open elements. */ 3099 $node = $this->stack[$n]; 3100 3101 /* 7.2 If node is not in the list of active 3102 formatting elements, then remove node from 3103 the stack of open elements and then go back 3104 to step 1. */ 3105 if (!in_array($node, $this->a_formatting, true)) { 3106 unset($this->stack[$n]); 3107 $this->stack = array_merge($this->stack); 3108 3109 } else { 3110 break; 3111 } 3112 } 3113 3114 /* 7.3 Otherwise, if node is the formatting 3115 element, then go to the next step in the overall 3116 algorithm. */ 3117 if ($node === $formatting_element) { 3118 break; 3119 3120 /* 7.4 Otherwise, if last node is the furthest 3121 block, then move the aforementioned bookmark to 3122 be immediately after the node in the list of 3123 active formatting elements. */ 3124 } elseif ($last_node === $furthest_block) { 3125 $bookmark = array_search($node, $this->a_formatting, true) + 1; 3126 } 3127 3128 /* 7.5 If node has any children, perform a 3129 shallow clone of node, replace the entry for 3130 node in the list of active formatting elements 3131 with an entry for the clone, replace the entry 3132 for node in the stack of open elements with an 3133 entry for the clone, and let node be the clone. 
*/ 3134 if ($node->hasChildNodes()) { 3135 $clone = $node->cloneNode(); 3136 $s_pos = array_search($node, $this->stack, true); 3137 $a_pos = array_search($node, $this->a_formatting, true); 3138 3139 $this->stack[$s_pos] = $clone; 3140 $this->a_formatting[$a_pos] = $clone; 3141 $node = $clone; 3142 } 3143 3144 /* 7.6 Insert last node into node, first removing 3145 it from its previous parent node if any. */ 3146 if ($last_node->parentNode !== null) { 3147 $last_node->parentNode->removeChild($last_node); 3148 } 3149 3150 $node->appendChild($last_node); 3151 3152 /* 7.7 Let last node be node. */ 3153 $last_node = $node; 3154 } 3155 3156 /* 8. Insert whatever last node ended up being in 3157 the previous step into the common ancestor node, 3158 first removing it from its previous parent node if 3159 any. */ 3160 if ($last_node->parentNode !== null) { 3161 $last_node->parentNode->removeChild($last_node); 3162 } 3163 3164 $common_ancestor->appendChild($last_node); 3165 3166 /* 9. Perform a shallow clone of the formatting 3167 element. */ 3168 $clone = $formatting_element->cloneNode(); 3169 3170 /* 10. Take all of the child nodes of the furthest 3171 block and append them to the clone created in the 3172 last step. */ 3173 while ($furthest_block->hasChildNodes()) { 3174 $child = $furthest_block->firstChild; 3175 $furthest_block->removeChild($child); 3176 $clone->appendChild($child); 3177 } 3178 3179 /* 11. Append that clone to the furthest block. */ 3180 $furthest_block->appendChild($clone); 3181 3182 /* 12. Remove the formatting element from the list 3183 of active formatting elements, and insert the clone 3184 into the list of active formatting elements at the 3185 position of the aforementioned bookmark. 
*/ 3186 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true); 3187 unset($this->a_formatting[$fe_af_pos]); 3188 $this->a_formatting = array_merge($this->a_formatting); 3189 3190 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1); 3191 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting)); 3192 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2); 3193 3194 /* 13. Remove the formatting element from the stack 3195 of open elements, and insert the clone into the stack 3196 of open elements immediately after (i.e. in a more 3197 deeply nested position than) the position of the 3198 furthest block in that stack. */ 3199 $fe_s_pos = array_search($formatting_element, $this->stack, true); 3200 $fb_s_pos = array_search($furthest_block, $this->stack, true); 3201 unset($this->stack[$fe_s_pos]); 3202 3203 $s_part1 = array_slice($this->stack, 0, $fb_s_pos); 3204 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack)); 3205 $this->stack = array_merge($s_part1, array($clone), $s_part2); 3206 3207 /* 14. Jump back to step 1 in this series of steps. */ 3208 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block); 3209 } 3210 break; 3211 3212 /* An end tag token whose tag name is one of: "button", 3213 "marquee", "object" */ 3214 case 'button': 3215 case 'marquee': 3216 case 'object': 3217 /* If the stack of open elements has an element in scope whose 3218 tag name matches the tag name of the token, then generate implied 3219 tags. */ 3220 if ($this->elementInScope($token['name'])) { 3221 $this->generateImpliedEndTags(); 3222 3223 /* Now, if the current node is not an element with the same 3224 tag name as the token, then this is a parse error. 
*/ 3225 // k 3226 3227 /* Now, if the stack of open elements has an element in scope 3228 whose tag name matches the tag name of the token, then pop 3229 elements from the stack until that element has been popped from 3230 the stack, and clear the list of active formatting elements up 3231 to the last marker. */ 3232 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 3233 if ($this->stack[$n]->nodeName === $token['name']) { 3234 $n = -1; 3235 } 3236 3237 array_pop($this->stack); 3238 } 3239 3240 $marker = end(array_keys($this->a_formatting, self::MARKER, true)); 3241 3242 for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) { 3243 array_pop($this->a_formatting); 3244 } 3245 } 3246 break; 3247 3248 /* Or an end tag whose tag name is one of: "area", "basefont", 3249 "bgsound", "br", "embed", "hr", "iframe", "image", "img", 3250 "input", "isindex", "noembed", "noframes", "param", "select", 3251 "spacer", "table", "textarea", "wbr" */ 3252 case 'area': 3253 case 'basefont': 3254 case 'bgsound': 3255 case 'br': 3256 case 'embed': 3257 case 'hr': 3258 case 'iframe': 3259 case 'image': 3260 case 'img': 3261 case 'input': 3262 case 'isindex': 3263 case 'noembed': 3264 case 'noframes': 3265 case 'param': 3266 case 'select': 3267 case 'spacer': 3268 case 'table': 3269 case 'textarea': 3270 case 'wbr': 3271 // Parse error. Ignore the token. 3272 break; 3273 3274 /* An end tag token not covered by the previous entries */ 3275 default: 3276 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 3277 /* Initialise node to be the current node (the bottommost 3278 node of the stack). */ 3279 $node = end($this->stack); 3280 3281 /* If node has the same tag name as the end tag token, 3282 then: */ 3283 if ($token['name'] === $node->nodeName) { 3284 /* Generate implied end tags. */ 3285 $this->generateImpliedEndTags(); 3286 3287 /* If the tag name of the end tag token does not 3288 match the tag name of the current node, this is a 3289 parse error. 
*/ 3290 // k 3291 3292 /* Pop all the nodes from the current node up to 3293 node, including node, then stop this algorithm. */ 3294 for ($x = count($this->stack) - $n; $x >= $n; $x--) { 3295 array_pop($this->stack); 3296 } 3297 3298 } else { 3299 $category = $this->getElementCategory($node); 3300 3301 if ($category !== self::SPECIAL && $category !== self::SCOPING) { 3302 /* Otherwise, if node is in neither the formatting 3303 category nor the phrasing category, then this is a 3304 parse error. Stop this algorithm. The end tag token 3305 is ignored. */ 3306 return false; 3307 } 3308 } 3309 } 3310 break; 3311 } 3312 break; 3313 } 3314 } 3315 3316 private function inTable($token) 3317 { 3318 $clear = array('html', 'table'); 3319 3320 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 3321 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 3322 or U+0020 SPACE */ 3323 if ($token['type'] === HTML5::CHARACTR && 3324 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 3325 ) { 3326 /* Append the character to the current node. */ 3327 $text = $this->dom->createTextNode($token['data']); 3328 end($this->stack)->appendChild($text); 3329 3330 /* A comment token */ 3331 } elseif ($token['type'] === HTML5::COMMENT) { 3332 /* Append a Comment node to the current node with the data 3333 attribute set to the data given in the comment token. */ 3334 $comment = $this->dom->createComment($token['data']); 3335 end($this->stack)->appendChild($comment); 3336 3337 /* A start tag whose tag name is "caption" */ 3338 } elseif ($token['type'] === HTML5::STARTTAG && 3339 $token['name'] === 'caption' 3340 ) { 3341 /* Clear the stack back to a table context. */ 3342 $this->clearStackToTableContext($clear); 3343 3344 /* Insert a marker at the end of the list of active 3345 formatting elements. */ 3346 $this->a_formatting[] = self::MARKER; 3347 3348 /* Insert an HTML element for the token, then switch the 3349 insertion mode to "in caption". 
*/ 3350 $this->insertElement($token); 3351 $this->mode = self::IN_CAPTION; 3352 3353 /* A start tag whose tag name is "colgroup" */ 3354 } elseif ($token['type'] === HTML5::STARTTAG && 3355 $token['name'] === 'colgroup' 3356 ) { 3357 /* Clear the stack back to a table context. */ 3358 $this->clearStackToTableContext($clear); 3359 3360 /* Insert an HTML element for the token, then switch the 3361 insertion mode to "in column group". */ 3362 $this->insertElement($token); 3363 $this->mode = self::IN_CGROUP; 3364 3365 /* A start tag whose tag name is "col" */ 3366 } elseif ($token['type'] === HTML5::STARTTAG && 3367 $token['name'] === 'col' 3368 ) { 3369 $this->inTable( 3370 array( 3371 'name' => 'colgroup', 3372 'type' => HTML5::STARTTAG, 3373 'attr' => array() 3374 ) 3375 ); 3376 3377 $this->inColumnGroup($token); 3378 3379 /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */ 3380 } elseif ($token['type'] === HTML5::STARTTAG && in_array( 3381 $token['name'], 3382 array('tbody', 'tfoot', 'thead') 3383 ) 3384 ) { 3385 /* Clear the stack back to a table context. */ 3386 $this->clearStackToTableContext($clear); 3387 3388 /* Insert an HTML element for the token, then switch the insertion 3389 mode to "in table body". */ 3390 $this->insertElement($token); 3391 $this->mode = self::IN_TBODY; 3392 3393 /* A start tag whose tag name is one of: "td", "th", "tr" */ 3394 } elseif ($token['type'] === HTML5::STARTTAG && 3395 in_array($token['name'], array('td', 'th', 'tr')) 3396 ) { 3397 /* Act as if a start tag token with the tag name "tbody" had been 3398 seen, then reprocess the current token. */ 3399 $this->inTable( 3400 array( 3401 'name' => 'tbody', 3402 'type' => HTML5::STARTTAG, 3403 'attr' => array() 3404 ) 3405 ); 3406 3407 return $this->inTableBody($token); 3408 3409 /* A start tag whose tag name is "table" */ 3410 } elseif ($token['type'] === HTML5::STARTTAG && 3411 $token['name'] === 'table' 3412 ) { 3413 /* Parse error. 
Act as if an end tag token with the tag name "table" 3414 had been seen, then, if that token wasn't ignored, reprocess the 3415 current token. */ 3416 $this->inTable( 3417 array( 3418 'name' => 'table', 3419 'type' => HTML5::ENDTAG 3420 ) 3421 ); 3422 3423 return $this->mainPhase($token); 3424 3425 /* An end tag whose tag name is "table" */ 3426 } elseif ($token['type'] === HTML5::ENDTAG && 3427 $token['name'] === 'table' 3428 ) { 3429 /* If the stack of open elements does not have an element in table 3430 scope with the same tag name as the token, this is a parse error. 3431 Ignore the token. (innerHTML case) */ 3432 if (!$this->elementInScope($token['name'], true)) { 3433 return false; 3434 3435 /* Otherwise: */ 3436 } else { 3437 /* Generate implied end tags. */ 3438 $this->generateImpliedEndTags(); 3439 3440 /* Now, if the current node is not a table element, then this 3441 is a parse error. */ 3442 // w/e 3443 3444 /* Pop elements from this stack until a table element has been 3445 popped from the stack. */ 3446 while (true) { 3447 $current = end($this->stack)->nodeName; 3448 array_pop($this->stack); 3449 3450 if ($current === 'table') { 3451 break; 3452 } 3453 } 3454 3455 /* Reset the insertion mode appropriately. */ 3456 $this->resetInsertionMode(); 3457 } 3458 3459 /* An end tag whose tag name is one of: "body", "caption", "col", 3460 "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ 3461 } elseif ($token['type'] === HTML5::ENDTAG && in_array( 3462 $token['name'], 3463 array( 3464 'body', 3465 'caption', 3466 'col', 3467 'colgroup', 3468 'html', 3469 'tbody', 3470 'td', 3471 'tfoot', 3472 'th', 3473 'thead', 3474 'tr' 3475 ) 3476 ) 3477 ) { 3478 // Parse error. Ignore the token. 3479 3480 /* Anything else */ 3481 } else { 3482 /* Parse error. 
Process the token as if the insertion mode was "in 3483 body", with the following exception: */ 3484 3485 /* If the current node is a table, tbody, tfoot, thead, or tr 3486 element, then, whenever a node would be inserted into the current 3487 node, it must instead be inserted into the foster parent element. */ 3488 if (in_array( 3489 end($this->stack)->nodeName, 3490 array('table', 'tbody', 'tfoot', 'thead', 'tr') 3491 ) 3492 ) { 3493 /* The foster parent element is the parent element of the last 3494 table element in the stack of open elements, if there is a 3495 table element and it has such a parent element. If there is no 3496 table element in the stack of open elements (innerHTML case), 3497 then the foster parent element is the first element in the 3498 stack of open elements (the html element). Otherwise, if there 3499 is a table element in the stack of open elements, but the last 3500 table element in the stack of open elements has no parent, or 3501 its parent node is not an element, then the foster parent 3502 element is the element before the last table element in the 3503 stack of open elements. 
*/
                for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                    if ($this->stack[$n]->nodeName === 'table') {
                        $table = $this->stack[$n];
                        break;
                    }
                }

                if (isset($table) && $table->parentNode !== null) {
                    $this->foster_parent = $table->parentNode;

                } elseif (!isset($table)) {
                    $this->foster_parent = $this->stack[0];

                } elseif (isset($table) && ($table->parentNode === null ||
                        $table->parentNode->nodeType !== XML_ELEMENT_NODE)
                ) {
                    $this->foster_parent = $this->stack[$n - 1];
                }
            }

            $this->inBody($token);
        }
    }

    /**
     * Processes one token for the "in caption" insertion mode.
     *
     * A "caption" end tag closes the open caption (if one is in table
     * scope) and switches the mode back to "in table". Table-structure
     * start tags and a "table" end tag first close the caption and are
     * then handed to inTable(). Stray structural end tags are dropped.
     * Everything else is delegated to inBody().
     *
     * @param array $token Token array; the 'type' and 'name' keys are read here.
     * @return mixed The result of inTable() when the token is re-dispatched
     *               there; nothing otherwise.
     */
    private function inCaption($token)
    {
        /* An end tag whose tag name is "caption" */
        if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if (!$this->elementInScope($token['name'], true)) {
                // Ignore

                /* Otherwise: */
            } else {
                /* Generate implied end tags. */
                $this->generateImpliedEndTags();

                /* Now, if the current node is not a caption element, then this
                is a parse error. */
                // w/e

                /* Pop elements from this stack until a caption element has
                been popped from the stack. */
                while (true) {
                    $node = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if ($node === 'caption') {
                        break;
                    }
                }

                /* Clear the list of active formatting elements up to the last
                marker. */
                $this->clearTheActiveFormattingElementsUpToTheLastMarker();

                /* Switch the insertion mode to "in table". */
                $this->mode = self::IN_TABLE;
            }

            /* A start tag whose tag name is one of: "caption", "col", "colgroup",
            "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
            name is "table" */
        } elseif (($token['type'] === HTML5::STARTTAG && in_array(
                    $token['name'],
                    array(
                        'caption',
                        'col',
                        'colgroup',
                        'tbody',
                        'td',
                        'tfoot',
                        'th',
                        'thead',
                        'tr'
                    )
                )) || ($token['type'] === HTML5::ENDTAG &&
                $token['name'] === 'table')
        ) {
            /* Parse error. Act as if an end tag with the tag name "caption"
            had been seen, then, if that token wasn't ignored, reprocess the
            current token. */
            $this->inCaption(
                array(
                    'name' => 'caption',
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->inTable($token);

            /* An end tag whose tag name is one of: "body", "col", "colgroup",
            "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
                $token['name'],
                array(
                    'body',
                    'col',
                    'colgroup',
                    'html',
                    'tbody',
                    // 'td' was missing from this list although the rule
                    // above names it; without it a stray </td> fell through
                    // to the "anything else" branch.
                    'td',
                    'tfoot',
                    'th',
                    'thead',
                    'tr'
                )
            )
        ) {
            // Parse error. Ignore the token.

            /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in body". */
            $this->inBody($token);
        }
    }

    /**
     * Processes one token for the "in column group" insertion mode.
     *
     * Whitespace and comments are appended to the current node, "col"
     * start tags are inserted and immediately popped, and a "colgroup"
     * end tag pops the current node and returns to "in table".
     *
     * @param array $token Token array; 'type' plus 'name' or 'data' are read.
     * @return mixed The result of inTable() when the token is re-dispatched
     *               there; nothing otherwise.
     */
    private function inColumnGroup($token)
    {
        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if ($token['type'] === HTML5::CHARACTR &&
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
        ) {
            /* Append the character to the current node. */
            $text = $this->dom->createTextNode($token['data']);
            end($this->stack)->appendChild($text);

            /* A comment token */
        } elseif ($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            end($this->stack)->appendChild($comment);

            /* A start tag whose tag name is "col" */
        } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
            /* Insert a col element for the token. Immediately pop the current
            node off the stack of open elements. */
            $this->insertElement($token);
            array_pop($this->stack);

            /* An end tag whose tag name is "colgroup" */
        } elseif ($token['type'] === HTML5::ENDTAG &&
            $token['name'] === 'colgroup'
        ) {
            /* If the current node is the root html element, then this is a
            parse error, ignore the token. (innerHTML case) */
            if (end($this->stack)->nodeName === 'html') {
                // Ignore

                /* Otherwise, pop the current node (which will be a colgroup
                element) from the stack of open elements. Switch the insertion
                mode to "in table". */
            } else {
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }

            /* An end tag whose tag name is "col" */
        } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
            /* Parse error. Ignore the token. */

            /* Anything else */
        } else {
            /* Act as if an end tag with the tag name "colgroup" had been seen,
            and then, if that token wasn't ignored, reprocess the current token.
*/
            $this->inColumnGroup(
                array(
                    'name' => 'colgroup',
                    'type' => HTML5::ENDTAG
                )
            );

            return $this->inTable($token);
        }
    }

    /**
     * Processes one token for the "in table body" insertion mode.
     *
     * "tr" start tags open a row; "td"/"th" start tags imply a "tr" first.
     * A "tbody"/"tfoot"/"thead" end tag closes the current section.
     * Section-level start tags and a "table" end tag close the current
     * section and are then re-dispatched through mainPhase(). Stray end
     * tags are dropped and anything else is delegated to inTable().
     *
     * @param array $token Token array; the 'type' and 'name' keys are read here.
     * @return mixed The result of inRow() or mainPhase() when the token is
     *               re-dispatched; nothing otherwise.
     */
    private function inTableBody($token)
    {
        $clear = array('tbody', 'tfoot', 'thead', 'html');

        /* A start tag whose tag name is "tr" */
        if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Insert a tr element for the token, then switch the insertion
            mode to "in row". */
            $this->insertElement($token);
            $this->mode = self::IN_ROW;

            /* A start tag whose tag name is one of: "th", "td" */
        } elseif ($token['type'] === HTML5::STARTTAG &&
            ($token['name'] === 'th' || $token['name'] === 'td')
        ) {
            /* Parse error. Act as if a start tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inTableBody(
                array(
                    'name' => 'tr',
                    'type' => HTML5::STARTTAG,
                    'attr' => array()
                )
            );

            return $this->inRow($token);

            /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
        } elseif ($token['type'] === HTML5::ENDTAG &&
            in_array($token['name'], array('tbody', 'tfoot', 'thead'))
        ) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. */
            if (!$this->elementInScope($token['name'], true)) {
                // Ignore

                /* Otherwise: */
            } else {
                /* Clear the stack back to a table body context. */
                $this->clearStackToTableContext($clear);

                /* Pop the current node from the stack of open elements. Switch
                the insertion mode to "in table". */
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }

            /* A start tag whose tag name is one of: "caption", "col", "colgroup",
            "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
        } elseif (($token['type'] === HTML5::STARTTAG && in_array(
                    $token['name'],
                    // Was 'tfoor', so a <tfoot> start tag never matched here.
                    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead')
                )) ||
            // Was STARTTAG; the rule above is about the "table" END tag.
            ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')
        ) {
            /* If the stack of open elements does not have a tbody, thead, or
            tfoot element in table scope, this is a parse error. Ignore the
            token. (innerHTML case) */
            if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
                // Ignore.

                /* Otherwise: */
            } else {
                /* Clear the stack back to a table body context. */
                $this->clearStackToTableContext($clear);

                /* Act as if an end tag with the same tag name as the current
                node ("tbody", "tfoot", or "thead") had been seen, then
                reprocess the current token. */
                $this->inTableBody(
                    array(
                        'name' => end($this->stack)->nodeName,
                        'type' => HTML5::ENDTAG
                    )
                );

                return $this->mainPhase($token);
            }

            /* An end tag whose tag name is one of: "body", "caption", "col",
            "colgroup", "html", "td", "th", "tr" */
        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
                $token['name'],
                array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
            )
        ) {
            /* Parse error. Ignore the token. */

            /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in table". */
            $this->inTable($token);
        }
    }

    /**
     * Processes one token for the "in row" insertion mode.
     *
     * "td"/"th" start tags open a cell and push a marker onto the list of
     * active formatting elements; a "tr" end tag closes the row and
     * returns to "in table body". Stray end tags are dropped and anything
     * else is delegated to inTable().
     *
     * @param array $token Token array; the 'type' and 'name' keys are read here.
     * @return mixed The result of the handler the token is re-dispatched to,
     *               when that happens; nothing otherwise.
     */
    private function inRow($token)
    {
        $clear = array('tr', 'html');

        /* A start tag whose tag name is one of: "th", "td" */
        if ($token['type'] === HTML5::STARTTAG &&
            ($token['name'] === 'th' || $token['name'] === 'td')
        ) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Insert an HTML element for the token, then switch the insertion
            mode to "in cell". */
            $this->insertElement($token);
            $this->mode = self::IN_CELL;

            /* Insert a marker at the end of the list of active formatting
            elements. */
            $this->a_formatting[] = self::MARKER;

            /* An end tag whose tag name is "tr" */
        } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if (!$this->elementInScope($token['name'], true)) {
                // Ignore.

                /* Otherwise: */
            } else {
                /* Clear the stack back to a table row context. */
                $this->clearStackToTableContext($clear);

                /* Pop the current node (which will be a tr element) from the
                stack of open elements. Switch the insertion mode to "in table
                body". */
                array_pop($this->stack);
                $this->mode = self::IN_TBODY;
            }

            /* A start tag whose tag name is one of: "caption", "col", "colgroup",
            "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
        } elseif ($token['type'] === HTML5::STARTTAG && in_array(
                $token['name'],
                array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr')
            )
        ) {
            /* Act as if an end tag with the tag name "tr" had been seen, then,
            if that token wasn't ignored, reprocess the current token.
*/
            $this->inRow(
                array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                )
            );

            // The implied </tr> above leaves the parser in "in table body",
            // so the token is reprocessed there. It used to be handed to
            // inCell(), which discarded it because no cell is open here.
            return $this->inTableBody($token);

            /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
        } elseif ($token['type'] === HTML5::ENDTAG &&
            in_array($token['name'], array('tbody', 'tfoot', 'thead'))
        ) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. */
            if (!$this->elementInScope($token['name'], true)) {
                // Ignore.

                /* Otherwise: */
            } else {
                /* Otherwise, act as if an end tag with the tag name "tr" had
                been seen, then reprocess the current token. */
                $this->inRow(
                    array(
                        'name' => 'tr',
                        'type' => HTML5::ENDTAG
                    )
                );

                // Same correction as above: reprocess in "in table body".
                return $this->inTableBody($token);
            }

            /* An end tag whose tag name is one of: "body", "caption", "col",
            "colgroup", "html", "td", "th" */
        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
                $token['name'],
                array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
            )
        ) {
            /* Parse error. Ignore the token. */

            /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in table". */
            $this->inTable($token);
        }
    }

    /**
     * Processes one token for the "in cell" insertion mode.
     *
     * A matching "td"/"th" end tag closes the cell and returns to "in row".
     * Table-structure start tags, and "table"/"tbody"/"tfoot"/"thead"/"tr"
     * end tags, close the cell via closeCell() and are then handed to
     * inRow(). A few stray end tags are dropped; everything else is
     * delegated to inBody().
     *
     * @param array $token Token array; the 'type' and 'name' keys are read here.
     * @return mixed The result of inRow() when the token is re-dispatched
     *               there; nothing otherwise.
     */
    private function inCell($token)
    {
        /* An end tag whose tag name is one of: "td", "th" */
        if ($token['type'] === HTML5::ENDTAG &&
            ($token['name'] === 'td' || $token['name'] === 'th')
        ) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as that of the token, then this is a
            parse error and the token must be ignored. */
            if (!$this->elementInScope($token['name'], true)) {
                // Ignore.

                /* Otherwise: */
            } else {
                /* Generate implied end tags, except for elements with the same
                tag name as the token. */
                $this->generateImpliedEndTags(array($token['name']));

                /* Now, if the current node is not an element with the same tag
                name as the token, then this is a parse error. */
                // k

                /* Pop elements from this stack until an element with the same
                tag name as the token has been popped from the stack. */
                while (true) {
                    $node = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if ($node === $token['name']) {
                        break;
                    }
                }

                /* Clear the list of active formatting elements up to the last
                marker. */
                $this->clearTheActiveFormattingElementsUpToTheLastMarker();

                /* Switch the insertion mode to "in row". (The current node
                will be a tr element at this point.) */
                $this->mode = self::IN_ROW;
            }

            /* A start tag whose tag name is one of: "caption", "col", "colgroup",
            "tbody", "td", "tfoot", "th", "thead", "tr" */
        } elseif ($token['type'] === HTML5::STARTTAG && in_array(
                $token['name'],
                array(
                    'caption',
                    'col',
                    'colgroup',
                    'tbody',
                    'td',
                    'tfoot',
                    'th',
                    'thead',
                    'tr'
                )
            )
        ) {
            /* If the stack of open elements does not have a td or th element
            in table scope, then this is a parse error; ignore the token.
            (innerHTML case) */
            if (!$this->elementInScope(array('td', 'th'), true)) {
                // Ignore.

                /* Otherwise, close the cell (see below) and reprocess the current
                token. */
            } else {
                $this->closeCell();
                return $this->inRow($token);
            }

            // A second, identical copy of the branch above used to follow
            // here; it could never be reached and has been removed.

            /* An end tag whose tag name is one of: "body", "caption", "col",
            "colgroup", "html" */
        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
                $token['name'],
                array('body', 'caption', 'col', 'colgroup', 'html')
            )
        ) {
            /* Parse error. Ignore the token. */

            /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
            "thead", "tr" */
        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
                $token['name'],
                array('table', 'tbody', 'tfoot', 'thead', 'tr')
            )
        ) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as that of the token (which can only
            happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
            then this is a parse error and the token must be ignored. */
            if (!$this->elementInScope($token['name'], true)) {
                // Ignore.

                /* Otherwise, close the cell (see below) and reprocess the current
                token. */
            } else {
                $this->closeCell();
                return $this->inRow($token);
            }

            /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in body". */
            $this->inBody($token);
        }
    }

    /**
     * Processes one token for the "in select" insertion mode.
     *
     * Text and comments are inserted at the current node. "option" and
     * "optgroup" start tags implicitly close an open option/optgroup
     * before being inserted; their end tags pop the matching current node.
     * A "select" end tag (or a nested "select" start tag, treated as one)
     * pops back through the select element and resets the insertion mode.
     * Table-structure end tags close the select first when a matching
     * element is in table scope. All other tokens are dropped.
     *
     * @param array $token Token array; 'type' plus 'name' or 'data' are read.
     * @return void
     */
    private function inSelect($token)
    {
        /* Handle the token as follows: */

        /* A character token */
        if ($token['type'] === HTML5::CHARACTR) {
            /* Append the token's character to the current node. */
            $this->insertText($token['data']);

            /* A comment token */
        } elseif ($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

            /* A start tag token whose tag name is "option" */
        } elseif ($token['type'] === HTML5::STARTTAG &&
            $token['name'] === 'option'
        ) {
            /* If the current node is an option element, act as if an end tag
            with the tag name "option" had been seen. */
            if (end($this->stack)->nodeName === 'option') {
                $this->inSelect(
                    array(
                        'name' => 'option',
                        'type' => HTML5::ENDTAG
                    )
                );
            }

            /* Insert an HTML element for the token. */
            $this->insertElement($token);

            /* A start tag token whose tag name is "optgroup" */
        } elseif ($token['type'] === HTML5::STARTTAG &&
            $token['name'] === 'optgroup'
        ) {
            /* If the current node is an option element, act as if an end tag
            with the tag name "option" had been seen. */
            if (end($this->stack)->nodeName === 'option') {
                $this->inSelect(
                    array(
                        'name' => 'option',
                        'type' => HTML5::ENDTAG
                    )
                );
            }

            /* If the current node is an optgroup element, act as if an end tag
            with the tag name "optgroup" had been seen. */
            if (end($this->stack)->nodeName === 'optgroup') {
                $this->inSelect(
                    array(
                        'name' => 'optgroup',
                        'type' => HTML5::ENDTAG
                    )
                );
            }

            /* Insert an HTML element for the token. */
            $this->insertElement($token);

            /* An end tag token whose tag name is "optgroup" */
        } elseif ($token['type'] === HTML5::ENDTAG &&
            $token['name'] === 'optgroup'
        ) {
            /* First, if the current node is an option element, and the node
            immediately before it in the stack of open elements is an optgroup
            element, then act as if an end tag with the tag name "option" had
            been seen. */
            $elements_in_stack = count($this->stack);

            // The length guard keeps the [-2] lookup inside the stack.
            if ($elements_in_stack >= 2 &&
                $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
                $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup'
            ) {
                $this->inSelect(
                    array(
                        'name' => 'option',
                        'type' => HTML5::ENDTAG
                    )
                );
            }

            /* If the current node is an optgroup element, then pop that node
            from the stack of open elements. Otherwise, this is a parse error,
            ignore the token. */
            // Re-read the current node: the implied </option> above may have
            // popped the stack. The old test compared the node object itself
            // with the string 'optgroup', which is never true, so the
            // optgroup was never closed.
            if (end($this->stack)->nodeName === 'optgroup') {
                array_pop($this->stack);
            }

            /* An end tag token whose tag name is "option" */
        } elseif ($token['type'] === HTML5::ENDTAG &&
            $token['name'] === 'option'
        ) {
            /* If the current node is an option element, then pop that node
            from the stack of open elements. Otherwise, this is a parse error,
            ignore the token. */
            if (end($this->stack)->nodeName === 'option') {
                array_pop($this->stack);
            }

            /* An end tag whose tag name is "select" */
        } elseif ($token['type'] === HTML5::ENDTAG &&
            $token['name'] === 'select'
        ) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if (!$this->elementInScope($token['name'], true)) {
                // w/e

                /* Otherwise: */
            } else {
                /* Pop elements from the stack of open elements until a select
                element has been popped from the stack. */
                while (true) {
                    $current = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if ($current === 'select') {
                        break;
                    }
                }

                /* Reset the insertion mode appropriately. */
                $this->resetInsertionMode();
            }

            /* A start tag whose tag name is "select" */
        } elseif ($token['type'] === HTML5::STARTTAG &&
            $token['name'] === 'select'
        ) {
            /* Parse error. Act as if the token had been an end tag with the
            tag name "select" instead. */
            $this->inSelect(
                array(
                    'name' => 'select',
                    'type' => HTML5::ENDTAG
                )
            );

            /* An end tag whose tag name is one of: "caption", "table", "tbody",
            "tfoot", "thead", "tr", "td", "th" */
        } elseif ($token['type'] === HTML5::ENDTAG && in_array(
                $token['name'],
                array(
                    'caption',
                    'table',
                    'tbody',
                    'tfoot',
                    'thead',
                    'tr',
                    'td',
                    'th'
                )
            )
        ) {
            /* Parse error. */
            // w/e

            /* If the stack of open elements has an element in table scope with
            the same tag name as that of the token, then act as if an end tag
            with the tag name "select" had been seen, and reprocess the token.
            Otherwise, ignore the token. */
            if ($this->elementInScope($token['name'], true)) {
                $this->inSelect(
                    array(
                        'name' => 'select',
                        'type' => HTML5::ENDTAG
                    )
                );

                $this->mainPhase($token);
            }

            /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * Processes one token for the "after body" insertion mode.
     *
     * Whitespace is handled by inBody(), comments are appended to the
     * first element on the stack of open elements, and an "html" end tag
     * switches to the trailing end phase.
     *
     * @param array $token Token array; 'type' plus 'name' or 'data' are read.
     * @return mixed The result of inBody() when the token is re-dispatched
     *               there; nothing otherwise.
     */
    private function afterBody($token)
    {
        /* Handle the token as follows: */

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if ($token['type'] === HTML5::CHARACTR &&
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
        ) {
            /* Process the token as it would be processed if the insertion mode
            was "in body". */
            $this->inBody($token);

            /* A comment token */
        } elseif ($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the first element in the stack of open
            elements (the html element), with the data attribute set to the
            data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->stack[0]->appendChild($comment);

            /* An end tag with the tag name "html" */
        } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') {
            /* If the parser was originally created in order to handle the
            setting of an element's innerHTML attribute, this is a parse error;
            ignore the token. (The element will be an html element in this
            case.) (innerHTML case) */

            /* Otherwise, switch to the trailing end phase. */
            $this->phase = self::END_PHASE;

            /* Anything else */
        } else {
            /* Parse error. Set the insertion mode to "in body" and reprocess
            the token.
*/
            $this->mode = self::IN_BODY;
            return $this->inBody($token);
        }
    }

    /**
     * Processes one token for the "in frameset" insertion mode.
     *
     * Whitespace and comments are inserted at the current node. "frameset"
     * start tags are inserted (and may nest); "frame" start tags are
     * inserted and immediately popped; "noframes" is delegated to
     * inBody(). A "frameset" end tag pops the current node and, once the
     * new current node is no longer a frameset, switches to "after
     * frameset". All other tokens are dropped.
     *
     * @param array $token Token array; 'type' plus 'name' or 'data' are read.
     * @return void
     */
    private function inFrameset($token)
    {
        /* Handle the token as follows: */

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
        if ($token['type'] === HTML5::CHARACTR &&
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
        ) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

            /* A comment token */
        } elseif ($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

            /* A start tag with the tag name "frameset" */
        } elseif ($token['type'] === HTML5::STARTTAG &&
            $token['name'] === 'frameset'
        ) {
            $this->insertElement($token);

            /* An end tag with the tag name "frameset" */
        } elseif ($token['type'] === HTML5::ENDTAG &&
            $token['name'] === 'frameset'
        ) {
            /* If the current node is the root html element, then this is a
            parse error; ignore the token. (innerHTML case) */
            if (end($this->stack)->nodeName === 'html') {
                // Ignore

            } else {
                /* Otherwise, pop the current node from the stack of open
                elements. */
                array_pop($this->stack);

                /* If the parser was not originally created in order to handle
                the setting of an element's innerHTML attribute (innerHTML case),
                and the current node is no longer a frameset element, then change
                the insertion mode to "after frameset". */
                // The mode used to be switched unconditionally, so closing a
                // nested frameset left the parser in "after frameset" while
                // the outer frameset was still open, and its remaining
                // frame/frameset children were dropped.
                if (end($this->stack)->nodeName !== 'frameset') {
                    $this->mode = self::AFTR_FRAME;
                }
            }

            /* A start tag with the tag name "frame" */
        } elseif ($token['type'] === HTML5::STARTTAG &&
            $token['name'] === 'frame'
        ) {
            /* Insert an HTML element for the token. */
            $this->insertElement($token);

            /* Immediately pop the current node off the stack of open elements. */
            array_pop($this->stack);

            /* A start tag with the tag name "noframes" */
        } elseif ($token['type'] === HTML5::STARTTAG &&
            $token['name'] === 'noframes'
        ) {
            /* Process the token as if the insertion mode had been "in body". */
            $this->inBody($token);

            /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * Processes one token for the "after frameset" insertion mode.
     *
     * Whitespace and comments are inserted at the current node, an "html"
     * end tag switches to the trailing end phase, and "noframes" start
     * tags are delegated to inBody(). All other tokens are dropped.
     *
     * @param array $token Token array; 'type' plus 'name' or 'data' are read.
     * @return void
     */
    private function afterFrameset($token)
    {
        /* Handle the token as follows: */

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
        if ($token['type'] === HTML5::CHARACTR &&
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
        ) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

            /* A comment token */
        } elseif ($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

            /* An end tag with the tag name "html" */
        } elseif ($token['name'] === 'html' &&
            $token['type'] === HTML5::ENDTAG
        ) {
            /* Switch to the trailing end phase. */
            $this->phase = self::END_PHASE;

            /* A start tag with the tag name "noframes" */
        } elseif ($token['name'] === 'noframes' &&
            $token['type'] === HTML5::STARTTAG
        ) {
            /* Process the token as if the insertion mode had been "in body".
*/
            $this->inBody($token);

            /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * Processes one token during the trailing end phase.
     *
     * DOCTYPE tokens are dropped, comments are appended to the document
     * itself, and whitespace is forwarded to mainPhase(). Any other
     * character data, start tag or end tag switches the parser back to the
     * main phase and is reprocessed there. End-of-file needs no action.
     *
     * @param array $token Token array; 'type' and, for character and comment
     *                     tokens, 'data' are read.
     * @return mixed The result of mainPhase() when the parser falls back to
     *               the main phase; nothing otherwise.
     */
    private function trailingEndPhase($token)
    {
        /* After the main phase, as each token is emitted from the tokenisation
        stage, it must be processed as described in this section. */

        /* A DOCTYPE token */
        if ($token['type'] === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.

            /* A comment token */
        } elseif ($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the Document object with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->dom->appendChild($comment);

            /* A character token that is one of one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
            or U+0020 SPACE */
        } elseif ($token['type'] === HTML5::CHARACTR &&
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
        ) {
            /* Process the token as it would be processed in the main phase. */
            $this->mainPhase($token);

            /* A character token that is not one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
            or U+0020 SPACE. Or a start tag token. Or an end tag token. */
        } elseif (($token['type'] === HTML5::CHARACTR &&
                // The negation was missing: this test repeated the whitespace
                // check of the previous branch, could never succeed, and so
                // non-whitespace text after </html> was silently discarded.
                !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
            $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG
        ) {
            /* Parse error. Switch back to the main phase and reprocess the
            token. */
            $this->phase = self::MAIN_PHASE;
            return $this->mainPhase($token);

            /* An end-of-file token */
        } elseif ($token['type'] === HTML5::EOF) {
            /* OMG DONE!! */
        }
    }

    /**
     * Creates an element for a start tag token, attaches it to the tree and
     * pushes it onto the stack of open elements.
     *
     * @param array $token  Start tag token; 'name' and 'attr' are read. Each
     *                      entry of 'attr' is an array with 'name' and 'value'.
     * @param bool  $append Not referenced in this method body.
     * @param bool  $check  When true, the tag name is first reduced to ASCII
     *                      letters, digits and hyphens, leading hyphens and
     *                      digits are stripped, and an empty result falls
     *                      back to "span".
     * @return DOMElement The newly created element.
     */
    private function insertElement($token, $append = true, $check = false)
    {
        // Proprietary workaround for libxml2's limitations with tag names
        if ($check) {
            // Slightly modified HTML5 tag-name modification,
            // removing anything that's not an ASCII letter, digit, or hyphen
            $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']);
            // Remove leading hyphens and numbers
            $token['name'] = ltrim($token['name'], '-0..9');
            // In theory, this should never be needed, but just in case
            if ($token['name'] === '') {
                $token['name'] = 'span';
            } // arbitrary generic choice
        }

        $el = $this->dom->createElement($token['name']);

        // When an attribute name repeats, the first occurrence wins.
        foreach ($token['attr'] as $attr) {
            if (!$el->hasAttribute($attr['name'])) {
                $el->setAttribute($attr['name'], $attr['value']);
            }
        }

        $this->appendToRealParent($el);
        $this->stack[] = $el;

        return $el;
    }

    /**
     * Creates a text node and attaches it via appendToRealParent().
     *
     * @param string $data Text content of the new node.
     * @return void
     */
    private function insertText($data)
    {
        $text = $this->dom->createTextNode($data);
        $this->appendToRealParent($text);
    }

    /**
     * Creates a comment node and attaches it via appendToRealParent().
     *
     * @param string $data Comment content of the new node.
     * @return void
     */
    private function insertComment($data)
    {
        $comment = $this->dom->createComment($data);
        $this->appendToRealParent($comment);
    }

    /**
     * Attaches a node either to the current node or, when a foster parent
     * has been set, to that foster parent.
     *
     * @param DOMNode $node Node to attach.
     * @return void
     */
    private function appendToRealParent($node)
    {
        if ($this->foster_parent === null) {
            end($this->stack)->appendChild($node);

        } elseif ($this->foster_parent !== null) {
            /* If the foster parent element is the parent element of the
            last table element in the stack of open elements, then the new
            node must be inserted immediately before the last table element
            in the stack of open elements in the foster parent element;
            otherwise, the new node must be appended to the foster parent
            element.
*/
            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                if ($this->stack[$n]->nodeName === 'table' &&
                    $this->stack[$n]->parentNode !== null
                ) {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if (isset($table) && $this->foster_parent->isSameNode($table->parentNode)) {
                $this->foster_parent->insertBefore($node, $table);
            } else {
                $this->foster_parent->appendChild($node);
            }

            $this->foster_parent = null;
        }
    }

    /**
     * Tells whether the stack of open elements has a given element "in
     * scope" or "in table scope".
     *
     * The stack is walked from the current node towards the root. The walk
     * succeeds on the first node whose tag name equals $el and fails on the
     * first scope barrier: a table element, the document's root element,
     * or (plain scope variant only) caption, td, th, button, marquee or
     * object.
     *
     * @param string|string[] $el    Tag name to look for, or a list of tag
     *                               names of which any one may match.
     * @param bool            $table True selects the "in table scope"
     *                               variant; anything else the plain
     *                               "in scope" variant.
     * @return bool True when a matching element is in scope, false otherwise.
     */
    private function elementInScope($el, $table = false)
    {
        if (is_array($el)) {
            foreach ($el as $element) {
                if ($this->elementInScope($element, $table)) {
                    return true;
                }
            }

            return false;
        }

        $leng = count($this->stack);

        for ($n = 0; $n < $leng; $n++) {
            /* 1. Initialise node to be the current node (the bottommost node of
            the stack). */
            $node = $this->stack[$leng - 1 - $n];

            if ($node->tagName === $el) {
                /* 2. If node is the target node, terminate in a match state. */
                return true;

            } elseif ($node->tagName === 'table') {
                /* 3. Otherwise, if node is a table element, terminate in a failure
                state. */
                return false;

            } elseif ($table !== true && in_array(
                    $node->tagName,
                    array(
                        'caption',
                        'td',
                        'th',
                        'button',
                        'marquee',
                        'object'
                    )
                )
            ) {
                /* 4. Otherwise, if the algorithm is the "has an element in scope"
                variant (rather than the "has an element in table scope" variant),
                and node is one of the following, terminate in a failure state. */
                // The test used to be inverted ($table === true), which made
                // cells and captions barriers for the TABLE-scope variant:
                // from inside a <td>, a "table" or "tr" element was never
                // reported in table scope and the matching end tag was ignored.
                return false;

            } elseif ($node === $node->ownerDocument->documentElement) {
                /* 5. Otherwise, if node is an html element (root element), terminate
                in a failure state. (This can only happen if the node is the topmost
                node of the stack of open elements, and prevents the next step from
                being invoked if there are no more elements in the stack.) */
                return false;
            }

            /* Otherwise, set node to the previous entry in the stack of open
            elements and return to step 2. (This will never fail, since the loop
            will always terminate in the previous step if the top of the stack
            is reached.) */
        }

        // Reached only when the stack is exhausted without a decision (for
        // example an empty stack); report "not in scope" explicitly rather
        // than falling off the end of the function.
        return false;
    }

    private function reconstructActiveFormattingElements()
    {
        /* 1. If there are no entries in the list of active formatting elements,
        then there is nothing to reconstruct; stop this algorithm. */
        $formatting_elements = count($this->a_formatting);

        if ($formatting_elements === 0) {
            return false;
        }

        /* 3. Let entry be the last (most recently added) element in the list
        of active formatting elements. */
        $entry = end($this->a_formatting);

        /* 2. If the last (most recently added) entry in the list of active
        formatting elements is a marker, or if it is an element that is in the
        stack of open elements, then there is nothing to reconstruct; stop this
        algorithm. */
        if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            return false;
        }

        for ($a = $formatting_elements - 1; $a >= 0; true) {
            /* 4. If there are no entries before entry in the list of active
            formatting elements, then jump to step 8. */
            if ($a === 0) {
                $step_seven = false;
                break;
            }

            /* 5. Let entry be the entry one earlier than entry in the list of
            active formatting elements. */
            $a--;
            $entry = $this->a_formatting[$a];

            /* 6. If entry is neither a marker nor an element that is also in
            the stack of open elements, go to step 4. */
            if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
                break;
            }
        }

        while (true) {
            /* 7.
Let entry be the element one later than entry in the list of 4570 active formatting elements. */ 4571 if (isset($step_seven) && $step_seven === true) { 4572 $a++; 4573 $entry = $this->a_formatting[$a]; 4574 } 4575 4576 /* 8. Perform a shallow clone of the element entry to obtain clone. */ 4577 $clone = $entry->cloneNode(); 4578 4579 /* 9. Append clone to the current node and push it onto the stack 4580 of open elements so that it is the new current node. */ 4581 end($this->stack)->appendChild($clone); 4582 $this->stack[] = $clone; 4583 4584 /* 10. Replace the entry for entry in the list with an entry for 4585 clone. */ 4586 $this->a_formatting[$a] = $clone; 4587 4588 /* 11. If the entry for clone in the list of active formatting 4589 elements is not the last entry in the list, return to step 7. */ 4590 if (end($this->a_formatting) !== $clone) { 4591 $step_seven = true; 4592 } else { 4593 break; 4594 } 4595 } 4596 } 4597 4598 private function clearTheActiveFormattingElementsUpToTheLastMarker() 4599 { 4600 /* When the steps below require the UA to clear the list of active 4601 formatting elements up to the last marker, the UA must perform the 4602 following steps: */ 4603 4604 while (true) { 4605 /* 1. Let entry be the last (most recently added) entry in the list 4606 of active formatting elements. */ 4607 $entry = end($this->a_formatting); 4608 4609 /* 2. Remove entry from the list of active formatting elements. */ 4610 array_pop($this->a_formatting); 4611 4612 /* 3. If entry was a marker, then stop the algorithm at this point. 4613 The list has been cleared up to the last marker. 
*/ 4614 if ($entry === self::MARKER) { 4615 break; 4616 } 4617 } 4618 } 4619 4620 private function generateImpliedEndTags($exclude = array()) 4621 { 4622 /* When the steps below require the UA to generate implied end tags, 4623 then, if the current node is a dd element, a dt element, an li element, 4624 a p element, a td element, a th element, or a tr element, the UA must 4625 act as if an end tag with the respective tag name had been seen and 4626 then generate implied end tags again. */ 4627 $node = end($this->stack); 4628 $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude); 4629 4630 while (in_array(end($this->stack)->nodeName, $elements)) { 4631 array_pop($this->stack); 4632 } 4633 } 4634 4635 private function getElementCategory($node) 4636 { 4637 $name = $node->tagName; 4638 if (in_array($name, $this->special)) { 4639 return self::SPECIAL; 4640 } elseif (in_array($name, $this->scoping)) { 4641 return self::SCOPING; 4642 } elseif (in_array($name, $this->formatting)) { 4643 return self::FORMATTING; 4644 } else { 4645 return self::PHRASING; 4646 } 4647 } 4648 4649 private function clearStackToTableContext($elements) 4650 { 4651 /* When the steps above require the UA to clear the stack back to a 4652 table context, it means that the UA must, while the current node is not 4653 a table element or an html element, pop elements from the stack of open 4654 elements. If this causes any elements to be popped from the stack, then 4655 this is a parse error. */ 4656 while (true) { 4657 $node = end($this->stack)->nodeName; 4658 4659 if (in_array($node, $elements)) { 4660 break; 4661 } else { 4662 array_pop($this->stack); 4663 } 4664 } 4665 } 4666 4667 private function resetInsertionMode() 4668 { 4669 /* 1. Let last be false. */ 4670 $last = false; 4671 $leng = count($this->stack); 4672 4673 for ($n = $leng - 1; $n >= 0; $n--) { 4674 /* 2. Let node be the last node in the stack of open elements. */ 4675 $node = $this->stack[$n]; 4676 4677 /* 3. 
If node is the first node in the stack of open elements, then 4678 set last to true. If the element whose innerHTML attribute is being 4679 set is neither a td element nor a th element, then set node to the 4680 element whose innerHTML attribute is being set. (innerHTML case) */ 4681 if ($this->stack[0]->isSameNode($node)) { 4682 $last = true; 4683 } 4684 4685 /* 4. If node is a select element, then switch the insertion mode to 4686 "in select" and abort these steps. (innerHTML case) */ 4687 if ($node->nodeName === 'select') { 4688 $this->mode = self::IN_SELECT; 4689 break; 4690 4691 /* 5. If node is a td or th element, then switch the insertion mode 4692 to "in cell" and abort these steps. */ 4693 } elseif ($node->nodeName === 'td' || $node->nodeName === 'th') { 4694 $this->mode = self::IN_CELL; 4695 break; 4696 4697 /* 6. If node is a tr element, then switch the insertion mode to 4698 "in row" and abort these steps. */ 4699 } elseif ($node->nodeName === 'tr') { 4700 $this->mode = self::IN_ROW; 4701 break; 4702 4703 /* 7. If node is a tbody, thead, or tfoot element, then switch the 4704 insertion mode to "in table body" and abort these steps. */ 4705 } elseif (in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) { 4706 $this->mode = self::IN_TBODY; 4707 break; 4708 4709 /* 8. If node is a caption element, then switch the insertion mode 4710 to "in caption" and abort these steps. */ 4711 } elseif ($node->nodeName === 'caption') { 4712 $this->mode = self::IN_CAPTION; 4713 break; 4714 4715 /* 9. If node is a colgroup element, then switch the insertion mode 4716 to "in column group" and abort these steps. (innerHTML case) */ 4717 } elseif ($node->nodeName === 'colgroup') { 4718 $this->mode = self::IN_CGROUP; 4719 break; 4720 4721 /* 10. If node is a table element, then switch the insertion mode 4722 to "in table" and abort these steps. */ 4723 } elseif ($node->nodeName === 'table') { 4724 $this->mode = self::IN_TABLE; 4725 break; 4726 4727 /* 11. 
If node is a head element, then switch the insertion mode 4728 to "in body" ("in body"! not "in head"!) and abort these steps. 4729 (innerHTML case) */ 4730 } elseif ($node->nodeName === 'head') { 4731 $this->mode = self::IN_BODY; 4732 break; 4733 4734 /* 12. If node is a body element, then switch the insertion mode to 4735 "in body" and abort these steps. */ 4736 } elseif ($node->nodeName === 'body') { 4737 $this->mode = self::IN_BODY; 4738 break; 4739 4740 /* 13. If node is a frameset element, then switch the insertion 4741 mode to "in frameset" and abort these steps. (innerHTML case) */ 4742 } elseif ($node->nodeName === 'frameset') { 4743 $this->mode = self::IN_FRAME; 4744 break; 4745 4746 /* 14. If node is an html element, then: if the head element 4747 pointer is null, switch the insertion mode to "before head", 4748 otherwise, switch the insertion mode to "after head". In either 4749 case, abort these steps. (innerHTML case) */ 4750 } elseif ($node->nodeName === 'html') { 4751 $this->mode = ($this->head_pointer === null) 4752 ? self::BEFOR_HEAD 4753 : self::AFTER_HEAD; 4754 4755 break; 4756 4757 /* 15. If last is true, then set the insertion mode to "in body" 4758 and abort these steps. (innerHTML case) */ 4759 } elseif ($last) { 4760 $this->mode = self::IN_BODY; 4761 break; 4762 } 4763 } 4764 } 4765 4766 private function closeCell() 4767 { 4768 /* If the stack of open elements has a td or th element in table scope, 4769 then act as if an end tag token with that tag name had been seen. */ 4770 foreach (array('td', 'th') as $cell) { 4771 if ($this->elementInScope($cell, true)) { 4772 $this->inCell( 4773 array( 4774 'name' => $cell, 4775 'type' => HTML5::ENDTAG 4776 ) 4777 ); 4778 4779 break; 4780 } 4781 } 4782 } 4783 4784 public function save() 4785 { 4786 return $this->dom; 4787 } 4788 }
title
Description
Body
title
Description
Body
title
Description
Body
title
Body