Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.2.x will end 22 April 2024 (12 months).
  • Bug fixes for security issues in 4.2.x will end 7 October 2024 (18 months).
  • PHP version: minimum PHP 8.0.0. Note: the minimum PHP version has increased since Moodle 4.1. PHP 8.1.x is also supported.

Differences Between: [Versions 310 and 402] [Versions 311 and 402] [Versions 39 and 402] [Versions 400 and 402]

   1  <?php
   2  
   3  /**
   4   * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
   5   * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.
   6   *
   7   * @note
   8   *    Recent changes to PHP's DOM extension have resulted in some fatal
   9   *    error conditions with the original version of PH5P. Pending changes,
  10   *    this lexer will punt to DirectLex if DOM throws an exception.
  11   */
  12  
  13  class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
  14  {
  15      /**
  16       * @param string $html
  17       * @param HTMLPurifier_Config $config
  18       * @param HTMLPurifier_Context $context
  19       * @return HTMLPurifier_Token[]
  20       */
  21      public function tokenizeHTML($html, $config, $context)
  22      {
  23          $new_html = $this->normalize($html, $config, $context);
  24          $new_html = $this->wrapHTML($new_html, $config, $context, false /* no div */);
  25          try {
  26              $parser = new HTML5($new_html);
  27              $doc = $parser->save();
  28          } catch (DOMException $e) {
  29              // Uh oh, it failed. Punt to DirectLex.
  30              $lexer = new HTMLPurifier_Lexer_DirectLex();
  31              $context->register('PH5PError', $e); // save the error, so we can detect it
  32              return $lexer->tokenizeHTML($html, $config, $context); // use original HTML
  33          }
  34          $tokens = array();
  35          $this->tokenizeDOM(
  36              $doc->getElementsByTagName('html')->item(0)-> // <html>
  37                    getElementsByTagName('body')->item(0) //   <body>
  38              ,
  39              $tokens, $config
  40          );
  41          return $tokens;
  42      }
  43  }
  44  
  45  /*
  46  
  47  Copyright 2007 Jeroen van der Meer <http://jero.net/>
  48  
  49  Permission is hereby granted, free of charge, to any person obtaining a
  50  copy of this software and associated documentation files (the
  51  "Software"), to deal in the Software without restriction, including
  52  without limitation the rights to use, copy, modify, merge, publish,
  53  distribute, sublicense, and/or sell copies of the Software, and to
  54  permit persons to whom the Software is furnished to do so, subject to
  55  the following conditions:
  56  
  57  The above copyright notice and this permission notice shall be included
  58  in all copies or substantial portions of the Software.
  59  
  60  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  61  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  62  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  63  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  64  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  65  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  66  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  67  
  68  */
  69  
  70  class HTML5
  71  {
  72      private $data;
  73      private $char;
  74      private $EOF;
  75      private $state;
  76      private $tree;
  77      private $token;
  78      private $content_model;
  79      private $escape = false;
  80      private $entities = array(
  81          'AElig;',
  82          'AElig',
  83          'AMP;',
  84          'AMP',
  85          'Aacute;',
  86          'Aacute',
  87          'Acirc;',
  88          'Acirc',
  89          'Agrave;',
  90          'Agrave',
  91          'Alpha;',
  92          'Aring;',
  93          'Aring',
  94          'Atilde;',
  95          'Atilde',
  96          'Auml;',
  97          'Auml',
  98          'Beta;',
  99          'COPY;',
 100          'COPY',
 101          'Ccedil;',
 102          'Ccedil',
 103          'Chi;',
 104          'Dagger;',
 105          'Delta;',
 106          'ETH;',
 107          'ETH',
 108          'Eacute;',
 109          'Eacute',
 110          'Ecirc;',
 111          'Ecirc',
 112          'Egrave;',
 113          'Egrave',
 114          'Epsilon;',
 115          'Eta;',
 116          'Euml;',
 117          'Euml',
 118          'GT;',
 119          'GT',
 120          'Gamma;',
 121          'Iacute;',
 122          'Iacute',
 123          'Icirc;',
 124          'Icirc',
 125          'Igrave;',
 126          'Igrave',
 127          'Iota;',
 128          'Iuml;',
 129          'Iuml',
 130          'Kappa;',
 131          'LT;',
 132          'LT',
 133          'Lambda;',
 134          'Mu;',
 135          'Ntilde;',
 136          'Ntilde',
 137          'Nu;',
 138          'OElig;',
 139          'Oacute;',
 140          'Oacute',
 141          'Ocirc;',
 142          'Ocirc',
 143          'Ograve;',
 144          'Ograve',
 145          'Omega;',
 146          'Omicron;',
 147          'Oslash;',
 148          'Oslash',
 149          'Otilde;',
 150          'Otilde',
 151          'Ouml;',
 152          'Ouml',
 153          'Phi;',
 154          'Pi;',
 155          'Prime;',
 156          'Psi;',
 157          'QUOT;',
 158          'QUOT',
 159          'REG;',
 160          'REG',
 161          'Rho;',
 162          'Scaron;',
 163          'Sigma;',
 164          'THORN;',
 165          'THORN',
 166          'TRADE;',
 167          'Tau;',
 168          'Theta;',
 169          'Uacute;',
 170          'Uacute',
 171          'Ucirc;',
 172          'Ucirc',
 173          'Ugrave;',
 174          'Ugrave',
 175          'Upsilon;',
 176          'Uuml;',
 177          'Uuml',
 178          'Xi;',
 179          'Yacute;',
 180          'Yacute',
 181          'Yuml;',
 182          'Zeta;',
 183          'aacute;',
 184          'aacute',
 185          'acirc;',
 186          'acirc',
 187          'acute;',
 188          'acute',
 189          'aelig;',
 190          'aelig',
 191          'agrave;',
 192          'agrave',
 193          'alefsym;',
 194          'alpha;',
 195          'amp;',
 196          'amp',
 197          'and;',
 198          'ang;',
 199          'apos;',
 200          'aring;',
 201          'aring',
 202          'asymp;',
 203          'atilde;',
 204          'atilde',
 205          'auml;',
 206          'auml',
 207          'bdquo;',
 208          'beta;',
 209          'brvbar;',
 210          'brvbar',
 211          'bull;',
 212          'cap;',
 213          'ccedil;',
 214          'ccedil',
 215          'cedil;',
 216          'cedil',
 217          'cent;',
 218          'cent',
 219          'chi;',
 220          'circ;',
 221          'clubs;',
 222          'cong;',
 223          'copy;',
 224          'copy',
 225          'crarr;',
 226          'cup;',
 227          'curren;',
 228          'curren',
 229          'dArr;',
 230          'dagger;',
 231          'darr;',
 232          'deg;',
 233          'deg',
 234          'delta;',
 235          'diams;',
 236          'divide;',
 237          'divide',
 238          'eacute;',
 239          'eacute',
 240          'ecirc;',
 241          'ecirc',
 242          'egrave;',
 243          'egrave',
 244          'empty;',
 245          'emsp;',
 246          'ensp;',
 247          'epsilon;',
 248          'equiv;',
 249          'eta;',
 250          'eth;',
 251          'eth',
 252          'euml;',
 253          'euml',
 254          'euro;',
 255          'exist;',
 256          'fnof;',
 257          'forall;',
 258          'frac12;',
 259          'frac12',
 260          'frac14;',
 261          'frac14',
 262          'frac34;',
 263          'frac34',
 264          'frasl;',
 265          'gamma;',
 266          'ge;',
 267          'gt;',
 268          'gt',
 269          'hArr;',
 270          'harr;',
 271          'hearts;',
 272          'hellip;',
 273          'iacute;',
 274          'iacute',
 275          'icirc;',
 276          'icirc',
 277          'iexcl;',
 278          'iexcl',
 279          'igrave;',
 280          'igrave',
 281          'image;',
 282          'infin;',
 283          'int;',
 284          'iota;',
 285          'iquest;',
 286          'iquest',
 287          'isin;',
 288          'iuml;',
 289          'iuml',
 290          'kappa;',
 291          'lArr;',
 292          'lambda;',
 293          'lang;',
 294          'laquo;',
 295          'laquo',
 296          'larr;',
 297          'lceil;',
 298          'ldquo;',
 299          'le;',
 300          'lfloor;',
 301          'lowast;',
 302          'loz;',
 303          'lrm;',
 304          'lsaquo;',
 305          'lsquo;',
 306          'lt;',
 307          'lt',
 308          'macr;',
 309          'macr',
 310          'mdash;',
 311          'micro;',
 312          'micro',
 313          'middot;',
 314          'middot',
 315          'minus;',
 316          'mu;',
 317          'nabla;',
 318          'nbsp;',
 319          'nbsp',
 320          'ndash;',
 321          'ne;',
 322          'ni;',
 323          'not;',
 324          'not',
 325          'notin;',
 326          'nsub;',
 327          'ntilde;',
 328          'ntilde',
 329          'nu;',
 330          'oacute;',
 331          'oacute',
 332          'ocirc;',
 333          'ocirc',
 334          'oelig;',
 335          'ograve;',
 336          'ograve',
 337          'oline;',
 338          'omega;',
 339          'omicron;',
 340          'oplus;',
 341          'or;',
 342          'ordf;',
 343          'ordf',
 344          'ordm;',
 345          'ordm',
 346          'oslash;',
 347          'oslash',
 348          'otilde;',
 349          'otilde',
 350          'otimes;',
 351          'ouml;',
 352          'ouml',
 353          'para;',
 354          'para',
 355          'part;',
 356          'permil;',
 357          'perp;',
 358          'phi;',
 359          'pi;',
 360          'piv;',
 361          'plusmn;',
 362          'plusmn',
 363          'pound;',
 364          'pound',
 365          'prime;',
 366          'prod;',
 367          'prop;',
 368          'psi;',
 369          'quot;',
 370          'quot',
 371          'rArr;',
 372          'radic;',
 373          'rang;',
 374          'raquo;',
 375          'raquo',
 376          'rarr;',
 377          'rceil;',
 378          'rdquo;',
 379          'real;',
 380          'reg;',
 381          'reg',
 382          'rfloor;',
 383          'rho;',
 384          'rlm;',
 385          'rsaquo;',
 386          'rsquo;',
 387          'sbquo;',
 388          'scaron;',
 389          'sdot;',
 390          'sect;',
 391          'sect',
 392          'shy;',
 393          'shy',
 394          'sigma;',
 395          'sigmaf;',
 396          'sim;',
 397          'spades;',
 398          'sub;',
 399          'sube;',
 400          'sum;',
 401          'sup1;',
 402          'sup1',
 403          'sup2;',
 404          'sup2',
 405          'sup3;',
 406          'sup3',
 407          'sup;',
 408          'supe;',
 409          'szlig;',
 410          'szlig',
 411          'tau;',
 412          'there4;',
 413          'theta;',
 414          'thetasym;',
 415          'thinsp;',
 416          'thorn;',
 417          'thorn',
 418          'tilde;',
 419          'times;',
 420          'times',
 421          'trade;',
 422          'uArr;',
 423          'uacute;',
 424          'uacute',
 425          'uarr;',
 426          'ucirc;',
 427          'ucirc',
 428          'ugrave;',
 429          'ugrave',
 430          'uml;',
 431          'uml',
 432          'upsih;',
 433          'upsilon;',
 434          'uuml;',
 435          'uuml',
 436          'weierp;',
 437          'xi;',
 438          'yacute;',
 439          'yacute',
 440          'yen;',
 441          'yen',
 442          'yuml;',
 443          'yuml',
 444          'zeta;',
 445          'zwj;',
 446          'zwnj;'
 447      );
 448  
 449      const PCDATA = 0;
 450      const RCDATA = 1;
 451      const CDATA = 2;
 452      const PLAINTEXT = 3;
 453  
 454      const DOCTYPE = 0;
 455      const STARTTAG = 1;
 456      const ENDTAG = 2;
 457      const COMMENT = 3;
 458      const CHARACTR = 4;
 459      const EOF = 5;
 460  
 461      public function __construct($data)
 462      {
 463          $this->data = $data;
 464          $this->char = -1;
 465          $this->EOF = strlen($data);
 466          $this->tree = new HTML5TreeConstructer;
 467          $this->content_model = self::PCDATA;
 468  
 469          $this->state = 'data';
 470  
 471          while ($this->state !== null) {
 472              $this->{$this->state . 'State'}();
 473          }
 474      }
 475  
 476      public function save()
 477      {
 478          return $this->tree->save();
 479      }
 480  
 481      private function char()
 482      {
 483          return ($this->char < $this->EOF)
 484              ? $this->data[$this->char]
 485              : false;
 486      }
 487  
 488      private function character($s, $l = 0)
 489      {
 490          if ($s + $l < $this->EOF) {
 491              if ($l === 0) {
 492                  return $this->data[$s];
 493              } else {
 494                  return substr($this->data, $s, $l);
 495              }
 496          }
 497      }
 498  
 499      private function characters($char_class, $start)
 500      {
 501          return preg_replace('#^([' . $char_class . ']+).*#s', '\\1', substr($this->data, $start));
 502      }
 503  
 504      private function dataState()
 505      {
 506          // Consume the next input character
 507          $this->char++;
 508          $char = $this->char();
 509  
 510          if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
 511              /* U+0026 AMPERSAND (&)
 512              When the content model flag is set to one of the PCDATA or RCDATA
 513              states: switch to the entity data state. Otherwise: treat it as per
 514              the "anything else"    entry below. */
 515              $this->state = 'entityData';
 516  
 517          } elseif ($char === '-') {
 518              /* If the content model flag is set to either the RCDATA state or
 519              the CDATA state, and the escape flag is false, and there are at
 520              least three characters before this one in the input stream, and the
 521              last four characters in the input stream, including this one, are
 522              U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
 523              and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
 524              if (($this->content_model === self::RCDATA || $this->content_model ===
 525                      self::CDATA) && $this->escape === false &&
 526                  $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--'
 527              ) {
 528                  $this->escape = true;
 529              }
 530  
 531              /* In any case, emit the input character as a character token. Stay
 532              in the data state. */
 533              $this->emitToken(
 534                  array(
 535                      'type' => self::CHARACTR,
 536                      'data' => $char
 537                  )
 538              );
 539  
 540              /* U+003C LESS-THAN SIGN (<) */
 541          } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
 542                  (($this->content_model === self::RCDATA ||
 543                          $this->content_model === self::CDATA) && $this->escape === false))
 544          ) {
 545              /* When the content model flag is set to the PCDATA state: switch
 546              to the tag open state.
 547  
 548              When the content model flag is set to either the RCDATA state or
 549              the CDATA state and the escape flag is false: switch to the tag
 550              open state.
 551  
 552              Otherwise: treat it as per the "anything else" entry below. */
 553              $this->state = 'tagOpen';
 554  
 555              /* U+003E GREATER-THAN SIGN (>) */
 556          } elseif ($char === '>') {
 557              /* If the content model flag is set to either the RCDATA state or
 558              the CDATA state, and the escape flag is true, and the last three
 559              characters in the input stream including this one are U+002D
 560              HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
 561              set the escape flag to false. */
 562              if (($this->content_model === self::RCDATA ||
 563                      $this->content_model === self::CDATA) && $this->escape === true &&
 564                  $this->character($this->char, 3) === '-->'
 565              ) {
 566                  $this->escape = false;
 567              }
 568  
 569              /* In any case, emit the input character as a character token.
 570              Stay in the data state. */
 571              $this->emitToken(
 572                  array(
 573                      'type' => self::CHARACTR,
 574                      'data' => $char
 575                  )
 576              );
 577  
 578          } elseif ($this->char === $this->EOF) {
 579              /* EOF
 580              Emit an end-of-file token. */
 581              $this->EOF();
 582  
 583          } elseif ($this->content_model === self::PLAINTEXT) {
 584              /* When the content model flag is set to the PLAINTEXT state
 585              THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
 586              the text and emit it as a character token. */
 587              $this->emitToken(
 588                  array(
 589                      'type' => self::CHARACTR,
 590                      'data' => substr($this->data, $this->char)
 591                  )
 592              );
 593  
 594              $this->EOF();
 595  
 596          } else {
 597              /* Anything else
 598              THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
 599              otherwise would also be treated as a character token and emit it
 600              as a single character token. Stay in the data state. */
 601              $len = strcspn($this->data, '<&', $this->char);
 602              $char = substr($this->data, $this->char, $len);
 603              $this->char += $len - 1;
 604  
 605              $this->emitToken(
 606                  array(
 607                      'type' => self::CHARACTR,
 608                      'data' => $char
 609                  )
 610              );
 611  
 612              $this->state = 'data';
 613          }
 614      }
 615  
 616      private function entityDataState()
 617      {
 618          // Attempt to consume an entity.
 619          $entity = $this->entity();
 620  
 621          // If nothing is returned, emit a U+0026 AMPERSAND character token.
 622          // Otherwise, emit the character token that was returned.
 623          $char = (!$entity) ? '&' : $entity;
 624          $this->emitToken(
 625              array(
 626                  'type' => self::CHARACTR,
 627                  'data' => $char
 628              )
 629          );
 630  
 631          // Finally, switch to the data state.
 632          $this->state = 'data';
 633      }
 634  
 635      private function tagOpenState()
 636      {
 637          switch ($this->content_model) {
 638              case self::RCDATA:
 639              case self::CDATA:
 640                  /* If the next input character is a U+002F SOLIDUS (/) character,
 641                  consume it and switch to the close tag open state. If the next
 642                  input character is not a U+002F SOLIDUS (/) character, emit a
 643                  U+003C LESS-THAN SIGN character token and switch to the data
 644                  state to process the next input character. */
 645                  if ($this->character($this->char + 1) === '/') {
 646                      $this->char++;
 647                      $this->state = 'closeTagOpen';
 648  
 649                  } else {
 650                      $this->emitToken(
 651                          array(
 652                              'type' => self::CHARACTR,
 653                              'data' => '<'
 654                          )
 655                      );
 656  
 657                      $this->state = 'data';
 658                  }
 659                  break;
 660  
 661              case self::PCDATA:
 662                  // If the content model flag is set to the PCDATA state
 663                  // Consume the next input character:
 664                  $this->char++;
 665                  $char = $this->char();
 666  
 667                  if ($char === '!') {
 668                      /* U+0021 EXCLAMATION MARK (!)
 669                      Switch to the markup declaration open state. */
 670                      $this->state = 'markupDeclarationOpen';
 671  
 672                  } elseif ($char === '/') {
 673                      /* U+002F SOLIDUS (/)
 674                      Switch to the close tag open state. */
 675                      $this->state = 'closeTagOpen';
 676  
 677                  } elseif (preg_match('/^[A-Za-z]$/', $char)) {
 678                      /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 679                      Create a new start tag token, set its tag name to the lowercase
 680                      version of the input character (add 0x0020 to the character's code
 681                      point), then switch to the tag name state. (Don't emit the token
 682                      yet; further details will be filled in before it is emitted.) */
 683                      $this->token = array(
 684                          'name' => strtolower($char),
 685                          'type' => self::STARTTAG,
 686                          'attr' => array()
 687                      );
 688  
 689                      $this->state = 'tagName';
 690  
 691                  } elseif ($char === '>') {
 692                      /* U+003E GREATER-THAN SIGN (>)
 693                      Parse error. Emit a U+003C LESS-THAN SIGN character token and a
 694                      U+003E GREATER-THAN SIGN character token. Switch to the data state. */
 695                      $this->emitToken(
 696                          array(
 697                              'type' => self::CHARACTR,
 698                              'data' => '<>'
 699                          )
 700                      );
 701  
 702                      $this->state = 'data';
 703  
 704                  } elseif ($char === '?') {
 705                      /* U+003F QUESTION MARK (?)
 706                      Parse error. Switch to the bogus comment state. */
 707                      $this->state = 'bogusComment';
 708  
 709                  } else {
 710                      /* Anything else
 711                      Parse error. Emit a U+003C LESS-THAN SIGN character token and
 712                      reconsume the current input character in the data state. */
 713                      $this->emitToken(
 714                          array(
 715                              'type' => self::CHARACTR,
 716                              'data' => '<'
 717                          )
 718                      );
 719  
 720                      $this->char--;
 721                      $this->state = 'data';
 722                  }
 723                  break;
 724          }
 725      }
 726  
 727      private function closeTagOpenState()
 728      {
 729          $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
 730          $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
 731  
 732          if (($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
 733              (!$the_same || ($the_same && (!preg_match(
 734                              '/[\t\n\x0b\x0c >\/]/',
 735                              $this->character($this->char + 1 + strlen($next_node))
 736                          ) || $this->EOF === $this->char)))
 737          ) {
 738              /* If the content model flag is set to the RCDATA or CDATA states then
 739              examine the next few characters. If they do not match the tag name of
 740              the last start tag token emitted (case insensitively), or if they do but
 741              they are not immediately followed by one of the following characters:
 742                  * U+0009 CHARACTER TABULATION
 743                  * U+000A LINE FEED (LF)
 744                  * U+000B LINE TABULATION
 745                  * U+000C FORM FEED (FF)
 746                  * U+0020 SPACE
 747                  * U+003E GREATER-THAN SIGN (>)
 748                  * U+002F SOLIDUS (/)
 749                  * EOF
 750              ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
 751              token, a U+002F SOLIDUS character token, and switch to the data state
 752              to process the next input character. */
 753              $this->emitToken(
 754                  array(
 755                      'type' => self::CHARACTR,
 756                      'data' => '</'
 757                  )
 758              );
 759  
 760              $this->state = 'data';
 761  
 762          } else {
 763              /* Otherwise, if the content model flag is set to the PCDATA state,
 764              or if the next few characters do match that tag name, consume the
 765              next input character: */
 766              $this->char++;
 767              $char = $this->char();
 768  
 769              if (preg_match('/^[A-Za-z]$/', $char)) {
 770                  /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 771                  Create a new end tag token, set its tag name to the lowercase version
 772                  of the input character (add 0x0020 to the character's code point), then
 773                  switch to the tag name state. (Don't emit the token yet; further details
 774                  will be filled in before it is emitted.) */
 775                  $this->token = array(
 776                      'name' => strtolower($char),
 777                      'type' => self::ENDTAG
 778                  );
 779  
 780                  $this->state = 'tagName';
 781  
 782              } elseif ($char === '>') {
 783                  /* U+003E GREATER-THAN SIGN (>)
 784                  Parse error. Switch to the data state. */
 785                  $this->state = 'data';
 786  
 787              } elseif ($this->char === $this->EOF) {
 788                  /* EOF
 789                  Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
 790                  SOLIDUS character token. Reconsume the EOF character in the data state. */
 791                  $this->emitToken(
 792                      array(
 793                          'type' => self::CHARACTR,
 794                          'data' => '</'
 795                      )
 796                  );
 797  
 798                  $this->char--;
 799                  $this->state = 'data';
 800  
 801              } else {
 802                  /* Parse error. Switch to the bogus comment state. */
 803                  $this->state = 'bogusComment';
 804              }
 805          }
 806      }
 807  
 808      private function tagNameState()
 809      {
 810          // Consume the next input character:
 811          $this->char++;
 812          $char = $this->character($this->char);
 813  
 814          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 815              /* U+0009 CHARACTER TABULATION
 816              U+000A LINE FEED (LF)
 817              U+000B LINE TABULATION
 818              U+000C FORM FEED (FF)
 819              U+0020 SPACE
 820              Switch to the before attribute name state. */
 821              $this->state = 'beforeAttributeName';
 822  
 823          } elseif ($char === '>') {
 824              /* U+003E GREATER-THAN SIGN (>)
 825              Emit the current tag token. Switch to the data state. */
 826              $this->emitToken($this->token);
 827              $this->state = 'data';
 828  
 829          } elseif ($this->char === $this->EOF) {
 830              /* EOF
 831              Parse error. Emit the current tag token. Reconsume the EOF
 832              character in the data state. */
 833              $this->emitToken($this->token);
 834  
 835              $this->char--;
 836              $this->state = 'data';
 837  
 838          } elseif ($char === '/') {
 839              /* U+002F SOLIDUS (/)
 840              Parse error unless this is a permitted slash. Switch to the before
 841              attribute name state. */
 842              $this->state = 'beforeAttributeName';
 843  
 844          } else {
 845              /* Anything else
 846              Append the current input character to the current tag token's tag name.
 847              Stay in the tag name state. */
 848              $this->token['name'] .= strtolower($char);
 849              $this->state = 'tagName';
 850          }
 851      }
 852  
 853      private function beforeAttributeNameState()
 854      {
 855          // Consume the next input character:
 856          $this->char++;
 857          $char = $this->character($this->char);
 858  
 859          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 860              /* U+0009 CHARACTER TABULATION
 861              U+000A LINE FEED (LF)
 862              U+000B LINE TABULATION
 863              U+000C FORM FEED (FF)
 864              U+0020 SPACE
 865              Stay in the before attribute name state. */
 866              $this->state = 'beforeAttributeName';
 867  
 868          } elseif ($char === '>') {
 869              /* U+003E GREATER-THAN SIGN (>)
 870              Emit the current tag token. Switch to the data state. */
 871              $this->emitToken($this->token);
 872              $this->state = 'data';
 873  
 874          } elseif ($char === '/') {
 875              /* U+002F SOLIDUS (/)
 876              Parse error unless this is a permitted slash. Stay in the before
 877              attribute name state. */
 878              $this->state = 'beforeAttributeName';
 879  
 880          } elseif ($this->char === $this->EOF) {
 881              /* EOF
 882              Parse error. Emit the current tag token. Reconsume the EOF
 883              character in the data state. */
 884              $this->emitToken($this->token);
 885  
 886              $this->char--;
 887              $this->state = 'data';
 888  
 889          } else {
 890              /* Anything else
 891              Start a new attribute in the current tag token. Set that attribute's
 892              name to the current input character, and its value to the empty string.
 893              Switch to the attribute name state. */
 894              $this->token['attr'][] = array(
 895                  'name' => strtolower($char),
 896                  'value' => null
 897              );
 898  
 899              $this->state = 'attributeName';
 900          }
 901      }
 902  
 903      private function attributeNameState()
 904      {
 905          // Consume the next input character:
 906          $this->char++;
 907          $char = $this->character($this->char);
 908  
 909          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 910              /* U+0009 CHARACTER TABULATION
 911              U+000A LINE FEED (LF)
 912              U+000B LINE TABULATION
 913              U+000C FORM FEED (FF)
 914              U+0020 SPACE
 915              Stay in the before attribute name state. */
 916              $this->state = 'afterAttributeName';
 917  
 918          } elseif ($char === '=') {
 919              /* U+003D EQUALS SIGN (=)
 920              Switch to the before attribute value state. */
 921              $this->state = 'beforeAttributeValue';
 922  
 923          } elseif ($char === '>') {
 924              /* U+003E GREATER-THAN SIGN (>)
 925              Emit the current tag token. Switch to the data state. */
 926              $this->emitToken($this->token);
 927              $this->state = 'data';
 928  
 929          } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
 930              /* U+002F SOLIDUS (/)
 931              Parse error unless this is a permitted slash. Switch to the before
 932              attribute name state. */
 933              $this->state = 'beforeAttributeName';
 934  
 935          } elseif ($this->char === $this->EOF) {
 936              /* EOF
 937              Parse error. Emit the current tag token. Reconsume the EOF
 938              character in the data state. */
 939              $this->emitToken($this->token);
 940  
 941              $this->char--;
 942              $this->state = 'data';
 943  
 944          } else {
 945              /* Anything else
 946              Append the current input character to the current attribute's name.
 947              Stay in the attribute name state. */
 948              $last = count($this->token['attr']) - 1;
 949              $this->token['attr'][$last]['name'] .= strtolower($char);
 950  
 951              $this->state = 'attributeName';
 952          }
 953      }
 954  
 955      private function afterAttributeNameState()
 956      {
 957          // Consume the next input character:
 958          $this->char++;
 959          $char = $this->character($this->char);
 960  
 961          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 962              /* U+0009 CHARACTER TABULATION
 963              U+000A LINE FEED (LF)
 964              U+000B LINE TABULATION
 965              U+000C FORM FEED (FF)
 966              U+0020 SPACE
 967              Stay in the after attribute name state. */
 968              $this->state = 'afterAttributeName';
 969  
 970          } elseif ($char === '=') {
 971              /* U+003D EQUALS SIGN (=)
 972              Switch to the before attribute value state. */
 973              $this->state = 'beforeAttributeValue';
 974  
 975          } elseif ($char === '>') {
 976              /* U+003E GREATER-THAN SIGN (>)
 977              Emit the current tag token. Switch to the data state. */
 978              $this->emitToken($this->token);
 979              $this->state = 'data';
 980  
 981          } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
 982              /* U+002F SOLIDUS (/)
 983              Parse error unless this is a permitted slash. Switch to the
 984              before attribute name state. */
 985              $this->state = 'beforeAttributeName';
 986  
 987          } elseif ($this->char === $this->EOF) {
 988              /* EOF
 989              Parse error. Emit the current tag token. Reconsume the EOF
 990              character in the data state. */
 991              $this->emitToken($this->token);
 992  
 993              $this->char--;
 994              $this->state = 'data';
 995  
 996          } else {
 997              /* Anything else
 998              Start a new attribute in the current tag token. Set that attribute's
 999              name to the current input character, and its value to the empty string.
1000              Switch to the attribute name state. */
1001              $this->token['attr'][] = array(
1002                  'name' => strtolower($char),
1003                  'value' => null
1004              );
1005  
1006              $this->state = 'attributeName';
1007          }
1008      }
1009  
1010      private function beforeAttributeValueState()
1011      {
1012          // Consume the next input character:
1013          $this->char++;
1014          $char = $this->character($this->char);
1015  
1016          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1017              /* U+0009 CHARACTER TABULATION
1018              U+000A LINE FEED (LF)
1019              U+000B LINE TABULATION
1020              U+000C FORM FEED (FF)
1021              U+0020 SPACE
1022              Stay in the before attribute value state. */
1023              $this->state = 'beforeAttributeValue';
1024  
1025          } elseif ($char === '"') {
1026              /* U+0022 QUOTATION MARK (")
1027              Switch to the attribute value (double-quoted) state. */
1028              $this->state = 'attributeValueDoubleQuoted';
1029  
1030          } elseif ($char === '&') {
1031              /* U+0026 AMPERSAND (&)
1032              Switch to the attribute value (unquoted) state and reconsume
1033              this input character. */
1034              $this->char--;
1035              $this->state = 'attributeValueUnquoted';
1036  
1037          } elseif ($char === '\'') {
1038              /* U+0027 APOSTROPHE (')
1039              Switch to the attribute value (single-quoted) state. */
1040              $this->state = 'attributeValueSingleQuoted';
1041  
1042          } elseif ($char === '>') {
1043              /* U+003E GREATER-THAN SIGN (>)
1044              Emit the current tag token. Switch to the data state. */
1045              $this->emitToken($this->token);
1046              $this->state = 'data';
1047  
1048          } else {
1049              /* Anything else
1050              Append the current input character to the current attribute's value.
1051              Switch to the attribute value (unquoted) state. */
1052              $last = count($this->token['attr']) - 1;
1053              $this->token['attr'][$last]['value'] .= $char;
1054  
1055              $this->state = 'attributeValueUnquoted';
1056          }
1057      }
1058  
1059      private function attributeValueDoubleQuotedState()
1060      {
1061          // Consume the next input character:
1062          $this->char++;
1063          $char = $this->character($this->char);
1064  
1065          if ($char === '"') {
1066              /* U+0022 QUOTATION MARK (")
1067              Switch to the before attribute name state. */
1068              $this->state = 'beforeAttributeName';
1069  
1070          } elseif ($char === '&') {
1071              /* U+0026 AMPERSAND (&)
1072              Switch to the entity in attribute value state. */
1073              $this->entityInAttributeValueState('double');
1074  
1075          } elseif ($this->char === $this->EOF) {
1076              /* EOF
1077              Parse error. Emit the current tag token. Reconsume the character
1078              in the data state. */
1079              $this->emitToken($this->token);
1080  
1081              $this->char--;
1082              $this->state = 'data';
1083  
1084          } else {
1085              /* Anything else
1086              Append the current input character to the current attribute's value.
1087              Stay in the attribute value (double-quoted) state. */
1088              $last = count($this->token['attr']) - 1;
1089              $this->token['attr'][$last]['value'] .= $char;
1090  
1091              $this->state = 'attributeValueDoubleQuoted';
1092          }
1093      }
1094  
1095      private function attributeValueSingleQuotedState()
1096      {
1097          // Consume the next input character:
1098          $this->char++;
1099          $char = $this->character($this->char);
1100  
1101          if ($char === '\'') {
1102              /* U+0022 QUOTATION MARK (')
1103              Switch to the before attribute name state. */
1104              $this->state = 'beforeAttributeName';
1105  
1106          } elseif ($char === '&') {
1107              /* U+0026 AMPERSAND (&)
1108              Switch to the entity in attribute value state. */
1109              $this->entityInAttributeValueState('single');
1110  
1111          } elseif ($this->char === $this->EOF) {
1112              /* EOF
1113              Parse error. Emit the current tag token. Reconsume the character
1114              in the data state. */
1115              $this->emitToken($this->token);
1116  
1117              $this->char--;
1118              $this->state = 'data';
1119  
1120          } else {
1121              /* Anything else
1122              Append the current input character to the current attribute's value.
1123              Stay in the attribute value (single-quoted) state. */
1124              $last = count($this->token['attr']) - 1;
1125              $this->token['attr'][$last]['value'] .= $char;
1126  
1127              $this->state = 'attributeValueSingleQuoted';
1128          }
1129      }
1130  
1131      private function attributeValueUnquotedState()
1132      {
1133          // Consume the next input character:
1134          $this->char++;
1135          $char = $this->character($this->char);
1136  
1137          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1138              /* U+0009 CHARACTER TABULATION
1139              U+000A LINE FEED (LF)
1140              U+000B LINE TABULATION
1141              U+000C FORM FEED (FF)
1142              U+0020 SPACE
1143              Switch to the before attribute name state. */
1144              $this->state = 'beforeAttributeName';
1145  
1146          } elseif ($char === '&') {
1147              /* U+0026 AMPERSAND (&)
1148              Switch to the entity in attribute value state. */
1149              $this->entityInAttributeValueState();
1150  
1151          } elseif ($char === '>') {
1152              /* U+003E GREATER-THAN SIGN (>)
1153              Emit the current tag token. Switch to the data state. */
1154              $this->emitToken($this->token);
1155              $this->state = 'data';
1156  
1157          } else {
1158              /* Anything else
1159              Append the current input character to the current attribute's value.
1160              Stay in the attribute value (unquoted) state. */
1161              $last = count($this->token['attr']) - 1;
1162              $this->token['attr'][$last]['value'] .= $char;
1163  
1164              $this->state = 'attributeValueUnquoted';
1165          }
1166      }
1167  
1168      private function entityInAttributeValueState()
1169      {
1170          // Attempt to consume an entity.
1171          $entity = $this->entity();
1172  
1173          // If nothing is returned, append a U+0026 AMPERSAND character to the
1174          // current attribute's value. Otherwise, emit the character token that
1175          // was returned.
1176          $char = (!$entity)
1177              ? '&'
1178              : $entity;
1179  
1180          $last = count($this->token['attr']) - 1;
1181          $this->token['attr'][$last]['value'] .= $char;
1182      }
1183  
1184      private function bogusCommentState()
1185      {
1186          /* Consume every character up to the first U+003E GREATER-THAN SIGN
1187          character (>) or the end of the file (EOF), whichever comes first. Emit
1188          a comment token whose data is the concatenation of all the characters
1189          starting from and including the character that caused the state machine
1190          to switch into the bogus comment state, up to and including the last
1191          consumed character before the U+003E character, if any, or up to the
1192          end of the file otherwise. (If the comment was started by the end of
1193          the file (EOF), the token is empty.) */
1194          $data = $this->characters('^>', $this->char);
1195          $this->emitToken(
1196              array(
1197                  'data' => $data,
1198                  'type' => self::COMMENT
1199              )
1200          );
1201  
1202          $this->char += strlen($data);
1203  
1204          /* Switch to the data state. */
1205          $this->state = 'data';
1206  
1207          /* If the end of the file was reached, reconsume the EOF character. */
1208          if ($this->char === $this->EOF) {
1209              $this->char = $this->EOF - 1;
1210          }
1211      }
1212  
1213      private function markupDeclarationOpenState()
1214      {
1215          /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1216          characters, consume those two characters, create a comment token whose
1217          data is the empty string, and switch to the comment state. */
1218          if ($this->character($this->char + 1, 2) === '--') {
1219              $this->char += 2;
1220              $this->state = 'comment';
1221              $this->token = array(
1222                  'data' => null,
1223                  'type' => self::COMMENT
1224              );
1225  
1226              /* Otherwise if the next seven chacacters are a case-insensitive match
1227              for the word "DOCTYPE", then consume those characters and switch to the
1228              DOCTYPE state. */
1229          } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') {
1230              $this->char += 7;
1231              $this->state = 'doctype';
1232  
1233              /* Otherwise, is is a parse error. Switch to the bogus comment state.
1234              The next character that is consumed, if any, is the first character
1235              that will be in the comment. */
1236          } else {
1237              $this->char++;
1238              $this->state = 'bogusComment';
1239          }
1240      }
1241  
1242      private function commentState()
1243      {
1244          /* Consume the next input character: */
1245          $this->char++;
1246          $char = $this->char();
1247  
1248          /* U+002D HYPHEN-MINUS (-) */
1249          if ($char === '-') {
1250              /* Switch to the comment dash state  */
1251              $this->state = 'commentDash';
1252  
1253              /* EOF */
1254          } elseif ($this->char === $this->EOF) {
1255              /* Parse error. Emit the comment token. Reconsume the EOF character
1256              in the data state. */
1257              $this->emitToken($this->token);
1258              $this->char--;
1259              $this->state = 'data';
1260  
1261              /* Anything else */
1262          } else {
1263              /* Append the input character to the comment token's data. Stay in
1264              the comment state. */
1265              $this->token['data'] .= $char;
1266          }
1267      }
1268  
1269      private function commentDashState()
1270      {
1271          /* Consume the next input character: */
1272          $this->char++;
1273          $char = $this->char();
1274  
1275          /* U+002D HYPHEN-MINUS (-) */
1276          if ($char === '-') {
1277              /* Switch to the comment end state  */
1278              $this->state = 'commentEnd';
1279  
1280              /* EOF */
1281          } elseif ($this->char === $this->EOF) {
1282              /* Parse error. Emit the comment token. Reconsume the EOF character
1283              in the data state. */
1284              $this->emitToken($this->token);
1285              $this->char--;
1286              $this->state = 'data';
1287  
1288              /* Anything else */
1289          } else {
1290              /* Append a U+002D HYPHEN-MINUS (-) character and the input
1291              character to the comment token's data. Switch to the comment state. */
1292              $this->token['data'] .= '-' . $char;
1293              $this->state = 'comment';
1294          }
1295      }
1296  
1297      private function commentEndState()
1298      {
1299          /* Consume the next input character: */
1300          $this->char++;
1301          $char = $this->char();
1302  
1303          if ($char === '>') {
1304              $this->emitToken($this->token);
1305              $this->state = 'data';
1306  
1307          } elseif ($char === '-') {
1308              $this->token['data'] .= '-';
1309  
1310          } elseif ($this->char === $this->EOF) {
1311              $this->emitToken($this->token);
1312              $this->char--;
1313              $this->state = 'data';
1314  
1315          } else {
1316              $this->token['data'] .= '--' . $char;
1317              $this->state = 'comment';
1318          }
1319      }
1320  
1321      private function doctypeState()
1322      {
1323          /* Consume the next input character: */
1324          $this->char++;
1325          $char = $this->char();
1326  
1327          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1328              $this->state = 'beforeDoctypeName';
1329  
1330          } else {
1331              $this->char--;
1332              $this->state = 'beforeDoctypeName';
1333          }
1334      }
1335  
1336      private function beforeDoctypeNameState()
1337      {
1338          /* Consume the next input character: */
1339          $this->char++;
1340          $char = $this->char();
1341  
1342          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1343              // Stay in the before DOCTYPE name state.
1344  
1345          } elseif (preg_match('/^[a-z]$/', $char)) {
1346              $this->token = array(
1347                  'name' => strtoupper($char),
1348                  'type' => self::DOCTYPE,
1349                  'error' => true
1350              );
1351  
1352              $this->state = 'doctypeName';
1353  
1354          } elseif ($char === '>') {
1355              $this->emitToken(
1356                  array(
1357                      'name' => null,
1358                      'type' => self::DOCTYPE,
1359                      'error' => true
1360                  )
1361              );
1362  
1363              $this->state = 'data';
1364  
1365          } elseif ($this->char === $this->EOF) {
1366              $this->emitToken(
1367                  array(
1368                      'name' => null,
1369                      'type' => self::DOCTYPE,
1370                      'error' => true
1371                  )
1372              );
1373  
1374              $this->char--;
1375              $this->state = 'data';
1376  
1377          } else {
1378              $this->token = array(
1379                  'name' => $char,
1380                  'type' => self::DOCTYPE,
1381                  'error' => true
1382              );
1383  
1384              $this->state = 'doctypeName';
1385          }
1386      }
1387  
1388      private function doctypeNameState()
1389      {
1390          /* Consume the next input character: */
1391          $this->char++;
1392          $char = $this->char();
1393  
1394          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1395              $this->state = 'AfterDoctypeName';
1396  
1397          } elseif ($char === '>') {
1398              $this->emitToken($this->token);
1399              $this->state = 'data';
1400  
1401          } elseif (preg_match('/^[a-z]$/', $char)) {
1402              $this->token['name'] .= strtoupper($char);
1403  
1404          } elseif ($this->char === $this->EOF) {
1405              $this->emitToken($this->token);
1406              $this->char--;
1407              $this->state = 'data';
1408  
1409          } else {
1410              $this->token['name'] .= $char;
1411          }
1412  
1413          $this->token['error'] = ($this->token['name'] === 'HTML')
1414              ? false
1415              : true;
1416      }
1417  
1418      private function afterDoctypeNameState()
1419      {
1420          /* Consume the next input character: */
1421          $this->char++;
1422          $char = $this->char();
1423  
1424          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1425              // Stay in the DOCTYPE name state.
1426  
1427          } elseif ($char === '>') {
1428              $this->emitToken($this->token);
1429              $this->state = 'data';
1430  
1431          } elseif ($this->char === $this->EOF) {
1432              $this->emitToken($this->token);
1433              $this->char--;
1434              $this->state = 'data';
1435  
1436          } else {
1437              $this->token['error'] = true;
1438              $this->state = 'bogusDoctype';
1439          }
1440      }
1441  
1442      private function bogusDoctypeState()
1443      {
1444          /* Consume the next input character: */
1445          $this->char++;
1446          $char = $this->char();
1447  
1448          if ($char === '>') {
1449              $this->emitToken($this->token);
1450              $this->state = 'data';
1451  
1452          } elseif ($this->char === $this->EOF) {
1453              $this->emitToken($this->token);
1454              $this->char--;
1455              $this->state = 'data';
1456  
1457          } else {
1458              // Stay in the bogus DOCTYPE state.
1459          }
1460      }
1461  
1462      private function entity()
1463      {
1464          $start = $this->char;
1465  
1466          // This section defines how to consume an entity. This definition is
1467          // used when parsing entities in text and in attributes.
1468  
1469          // The behaviour depends on the identity of the next character (the
1470          // one immediately after the U+0026 AMPERSAND character):
1471  
1472          switch ($this->character($this->char + 1)) {
1473              // U+0023 NUMBER SIGN (#)
1474              case '#':
1475  
1476                  // The behaviour further depends on the character after the
1477                  // U+0023 NUMBER SIGN:
1478                  switch ($this->character($this->char + 1)) {
1479                      // U+0078 LATIN SMALL LETTER X
1480                      // U+0058 LATIN CAPITAL LETTER X
1481                      case 'x':
1482                      case 'X':
1483                          // Follow the steps below, but using the range of
1484                          // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1485                          // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1486                          // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1487                          // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1488                          // words, 0-9, A-F, a-f).
1489                          $char = 1;
1490                          $char_class = '0-9A-Fa-f';
1491                          break;
1492  
1493                      // Anything else
1494                      default:
1495                          // Follow the steps below, but using the range of
1496                          // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1497                          // NINE (i.e. just 0-9).
1498                          $char = 0;
1499                          $char_class = '0-9';
1500                          break;
1501                  }
1502  
1503                  // Consume as many characters as match the range of characters
1504                  // given above.
1505                  $this->char++;
1506                  $e_name = $this->characters($char_class, $this->char + $char + 1);
1507                  $entity = $this->character($start, $this->char);
1508                  $cond = strlen($e_name) > 0;
1509  
1510                  // The rest of the parsing happens below.
1511                  break;
1512  
1513              // Anything else
1514              default:
1515                  // Consume the maximum number of characters possible, with the
1516                  // consumed characters case-sensitively matching one of the
1517                  // identifiers in the first column of the entities table.
1518  
1519                  $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1520                  $len = strlen($e_name);
1521  
1522                  for ($c = 1; $c <= $len; $c++) {
1523                      $id = substr($e_name, 0, $c);
1524                      $this->char++;
1525  
1526                      if (in_array($id, $this->entities)) {
1527                          if ($e_name[$c - 1] !== ';') {
1528                              if ($c < $len && $e_name[$c] == ';') {
1529                                  $this->char++; // consume extra semicolon
1530                              }
1531                          }
1532                          $entity = $id;
1533                          break;
1534                      }
1535                  }
1536  
1537                  $cond = isset($entity);
1538                  // The rest of the parsing happens below.
1539                  break;
1540          }
1541  
1542          if (!$cond) {
1543              // If no match can be made, then this is a parse error. No
1544              // characters are consumed, and nothing is returned.
1545              $this->char = $start;
1546              return false;
1547          }
1548  
1549          // Return a character token for the character corresponding to the
1550          // entity name (as given by the second column of the entities table).
1551          return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
1552      }
1553  
1554      private function emitToken($token)
1555      {
1556          $emit = $this->tree->emitToken($token);
1557  
1558          if (is_int($emit)) {
1559              $this->content_model = $emit;
1560  
1561          } elseif ($token['type'] === self::ENDTAG) {
1562              $this->content_model = self::PCDATA;
1563          }
1564      }
1565  
1566      private function EOF()
1567      {
1568          $this->state = null;
1569          $this->tree->emitToken(
1570              array(
1571                  'type' => self::EOF
1572              )
1573          );
1574      }
1575  }
1576  
1577  class HTML5TreeConstructer
1578  {
1579      public $stack = array();
1580  
1581      private $phase;
1582      private $mode;
1583      private $dom;
1584      private $foster_parent = null;
1585      private $a_formatting = array();
1586  
1587      private $head_pointer = null;
1588      private $form_pointer = null;
1589  
1590      private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');
1591      private $formatting = array(
1592          'a',
1593          'b',
1594          'big',
1595          'em',
1596          'font',
1597          'i',
1598          'nobr',
1599          's',
1600          'small',
1601          'strike',
1602          'strong',
1603          'tt',
1604          'u'
1605      );
1606      private $special = array(
1607          'address',
1608          'area',
1609          'base',
1610          'basefont',
1611          'bgsound',
1612          'blockquote',
1613          'body',
1614          'br',
1615          'center',
1616          'col',
1617          'colgroup',
1618          'dd',
1619          'dir',
1620          'div',
1621          'dl',
1622          'dt',
1623          'embed',
1624          'fieldset',
1625          'form',
1626          'frame',
1627          'frameset',
1628          'h1',
1629          'h2',
1630          'h3',
1631          'h4',
1632          'h5',
1633          'h6',
1634          'head',
1635          'hr',
1636          'iframe',
1637          'image',
1638          'img',
1639          'input',
1640          'isindex',
1641          'li',
1642          'link',
1643          'listing',
1644          'menu',
1645          'meta',
1646          'noembed',
1647          'noframes',
1648          'noscript',
1649          'ol',
1650          'optgroup',
1651          'option',
1652          'p',
1653          'param',
1654          'plaintext',
1655          'pre',
1656          'script',
1657          'select',
1658          'spacer',
1659          'style',
1660          'tbody',
1661          'textarea',
1662          'tfoot',
1663          'thead',
1664          'title',
1665          'tr',
1666          'ul',
1667          'wbr'
1668      );
1669  
1670      // The different phases.
1671      const INIT_PHASE = 0;
1672      const ROOT_PHASE = 1;
1673      const MAIN_PHASE = 2;
1674      const END_PHASE = 3;
1675  
1676      // The different insertion modes for the main phase.
1677      const BEFOR_HEAD = 0;
1678      const IN_HEAD = 1;
1679      const AFTER_HEAD = 2;
1680      const IN_BODY = 3;
1681      const IN_TABLE = 4;
1682      const IN_CAPTION = 5;
1683      const IN_CGROUP = 6;
1684      const IN_TBODY = 7;
1685      const IN_ROW = 8;
1686      const IN_CELL = 9;
1687      const IN_SELECT = 10;
1688      const AFTER_BODY = 11;
1689      const IN_FRAME = 12;
1690      const AFTR_FRAME = 13;
1691  
1692      // The different types of elements.
1693      const SPECIAL = 0;
1694      const SCOPING = 1;
1695      const FORMATTING = 2;
1696      const PHRASING = 3;
1697  
1698      const MARKER = 0;
1699  
1700      public function __construct()
1701      {
1702          $this->phase = self::INIT_PHASE;
1703          $this->mode = self::BEFOR_HEAD;
1704          $this->dom = new DOMDocument;
1705  
1706          $this->dom->encoding = 'UTF-8';
1707          $this->dom->preserveWhiteSpace = true;
1708          $this->dom->substituteEntities = true;
1709          $this->dom->strictErrorChecking = false;
1710      }
1711  
1712      // Process tag tokens
1713      public function emitToken($token)
1714      {
1715          switch ($this->phase) {
1716              case self::INIT_PHASE:
1717                  return $this->initPhase($token);
1718                  break;
1719              case self::ROOT_PHASE:
1720                  return $this->rootElementPhase($token);
1721                  break;
1722              case self::MAIN_PHASE:
1723                  return $this->mainPhase($token);
1724                  break;
1725              case self::END_PHASE :
1726                  return $this->trailingEndPhase($token);
1727                  break;
1728          }
1729      }
1730  
1731      private function initPhase($token)
1732      {
1733          /* Initially, the tree construction stage must handle each token
1734          emitted from the tokenisation stage as follows: */
1735  
1736          /* A DOCTYPE token that is marked as being in error
1737          A comment token
1738          A start tag token
1739          An end tag token
1740          A character token that is not one of one of U+0009 CHARACTER TABULATION,
1741              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1742              or U+0020 SPACE
1743          An end-of-file token */
1744          if ((isset($token['error']) && $token['error']) ||
1745              $token['type'] === HTML5::COMMENT ||
1746              $token['type'] === HTML5::STARTTAG ||
1747              $token['type'] === HTML5::ENDTAG ||
1748              $token['type'] === HTML5::EOF ||
1749              ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1750                  !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))
1751          ) {
1752              /* This specification does not define how to handle this case. In
1753              particular, user agents may ignore the entirety of this specification
1754              altogether for such documents, and instead invoke special parse modes
1755              with a greater emphasis on backwards compatibility. */
1756  
1757              $this->phase = self::ROOT_PHASE;
1758              return $this->rootElementPhase($token);
1759  
1760              /* A DOCTYPE token marked as being correct */
1761          } elseif (isset($token['error']) && !$token['error']) {
1762              /* Append a DocumentType node to the Document  node, with the name
1763              attribute set to the name given in the DOCTYPE token (which will be
1764              "HTML"), and the other attributes specific to DocumentType objects
1765              set to null, empty lists, or the empty string as appropriate. */
1766              $doctype = new DOMDocumentType(null, null, 'HTML');
1767  
1768              /* Then, switch to the root element phase of the tree construction
1769              stage. */
1770              $this->phase = self::ROOT_PHASE;
1771  
1772              /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1773              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1774              or U+0020 SPACE */
1775          } elseif (isset($token['data']) && preg_match(
1776                  '/^[\t\n\x0b\x0c ]+$/',
1777                  $token['data']
1778              )
1779          ) {
1780              /* Append that character  to the Document node. */
1781              $text = $this->dom->createTextNode($token['data']);
1782              $this->dom->appendChild($text);
1783          }
1784      }
1785  
1786      private function rootElementPhase($token)
1787      {
1788          /* After the initial phase, as each token is emitted from the tokenisation
1789          stage, it must be processed as described in this section. */
1790  
1791          /* A DOCTYPE token */
1792          if ($token['type'] === HTML5::DOCTYPE) {
1793              // Parse error. Ignore the token.
1794  
1795              /* A comment token */
1796          } elseif ($token['type'] === HTML5::COMMENT) {
1797              /* Append a Comment node to the Document object with the data
1798              attribute set to the data given in the comment token. */
1799              $comment = $this->dom->createComment($token['data']);
1800              $this->dom->appendChild($comment);
1801  
1802              /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1803              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1804              or U+0020 SPACE */
1805          } elseif ($token['type'] === HTML5::CHARACTR &&
1806              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1807          ) {
1808              /* Append that character  to the Document node. */
1809              $text = $this->dom->createTextNode($token['data']);
1810              $this->dom->appendChild($text);
1811  
1812              /* A character token that is not one of U+0009 CHARACTER TABULATION,
1813                  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1814                  (FF), or U+0020 SPACE
1815              A start tag token
1816              An end tag token
1817              An end-of-file token */
1818          } elseif (($token['type'] === HTML5::CHARACTR &&
1819                  !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1820              $token['type'] === HTML5::STARTTAG ||
1821              $token['type'] === HTML5::ENDTAG ||
1822              $token['type'] === HTML5::EOF
1823          ) {
1824              /* Create an HTMLElement node with the tag name html, in the HTML
1825              namespace. Append it to the Document object. Switch to the main
1826              phase and reprocess the current token. */
1827              $html = $this->dom->createElement('html');
1828              $this->dom->appendChild($html);
1829              $this->stack[] = $html;
1830  
1831              $this->phase = self::MAIN_PHASE;
1832              return $this->mainPhase($token);
1833          }
1834      }
1835  
1836      private function mainPhase($token)
1837      {
1838          /* Tokens in the main phase must be handled as follows: */
1839  
1840          /* A DOCTYPE token */
1841          if ($token['type'] === HTML5::DOCTYPE) {
1842              // Parse error. Ignore the token.
1843  
1844              /* A start tag token with the tag name "html" */
1845          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1846              /* If this start tag token was not the first start tag token, then
1847              it is a parse error. */
1848  
1849              /* For each attribute on the token, check to see if the attribute
1850              is already present on the top element of the stack of open elements.
1851              If it is not, add the attribute and its corresponding value to that
1852              element. */
1853              foreach ($token['attr'] as $attr) {
1854                  if (!$this->stack[0]->hasAttribute($attr['name'])) {
1855                      $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1856                  }
1857              }
1858  
1859              /* An end-of-file token */
1860          } elseif ($token['type'] === HTML5::EOF) {
1861              /* Generate implied end tags. */
1862              $this->generateImpliedEndTags();
1863  
1864              /* Anything else. */
1865          } else {
1866              /* Depends on the insertion mode: */
1867              switch ($this->mode) {
1868                  case self::BEFOR_HEAD:
1869                      return $this->beforeHead($token);
1870                      break;
1871                  case self::IN_HEAD:
1872                      return $this->inHead($token);
1873                      break;
1874                  case self::AFTER_HEAD:
1875                      return $this->afterHead($token);
1876                      break;
1877                  case self::IN_BODY:
1878                      return $this->inBody($token);
1879                      break;
1880                  case self::IN_TABLE:
1881                      return $this->inTable($token);
1882                      break;
1883                  case self::IN_CAPTION:
1884                      return $this->inCaption($token);
1885                      break;
1886                  case self::IN_CGROUP:
1887                      return $this->inColumnGroup($token);
1888                      break;
1889                  case self::IN_TBODY:
1890                      return $this->inTableBody($token);
1891                      break;
1892                  case self::IN_ROW:
1893                      return $this->inRow($token);
1894                      break;
1895                  case self::IN_CELL:
1896                      return $this->inCell($token);
1897                      break;
1898                  case self::IN_SELECT:
1899                      return $this->inSelect($token);
1900                      break;
1901                  case self::AFTER_BODY:
1902                      return $this->afterBody($token);
1903                      break;
1904                  case self::IN_FRAME:
1905                      return $this->inFrameset($token);
1906                      break;
1907                  case self::AFTR_FRAME:
1908                      return $this->afterFrameset($token);
1909                      break;
1910                  case self::END_PHASE:
1911                      return $this->trailingEndPhase($token);
1912                      break;
1913              }
1914          }
1915      }
1916  
1917      private function beforeHead($token)
1918      {
1919          /* Handle the token as follows: */
1920  
1921          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1922          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1923          or U+0020 SPACE */
1924          if ($token['type'] === HTML5::CHARACTR &&
1925              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1926          ) {
1927              /* Append the character to the current node. */
1928              $this->insertText($token['data']);
1929  
1930              /* A comment token */
1931          } elseif ($token['type'] === HTML5::COMMENT) {
1932              /* Append a Comment node to the current node with the data attribute
1933              set to the data given in the comment token. */
1934              $this->insertComment($token['data']);
1935  
1936              /* A start tag token with the tag name "head" */
1937          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1938              /* Create an element for the token, append the new element to the
1939              current node and push it onto the stack of open elements. */
1940              $element = $this->insertElement($token);
1941  
1942              /* Set the head element pointer to this new element node. */
1943              $this->head_pointer = $element;
1944  
1945              /* Change the insertion mode to "in head". */
1946              $this->mode = self::IN_HEAD;
1947  
1948              /* A start tag token whose tag name is one of: "base", "link", "meta",
1949              "script", "style", "title". Or an end tag with the tag name "html".
1950              Or a character token that is not one of U+0009 CHARACTER TABULATION,
1951              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1952              or U+0020 SPACE. Or any other start tag token */
1953          } elseif ($token['type'] === HTML5::STARTTAG ||
1954              ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1955              ($token['type'] === HTML5::CHARACTR && !preg_match(
1956                      '/^[\t\n\x0b\x0c ]$/',
1957                      $token['data']
1958                  ))
1959          ) {
1960              /* Act as if a start tag token with the tag name "head" and no
1961              attributes had been seen, then reprocess the current token. */
1962              $this->beforeHead(
1963                  array(
1964                      'name' => 'head',
1965                      'type' => HTML5::STARTTAG,
1966                      'attr' => array()
1967                  )
1968              );
1969  
1970              return $this->inHead($token);
1971  
1972              /* Any other end tag */
1973          } elseif ($token['type'] === HTML5::ENDTAG) {
1974              /* Parse error. Ignore the token. */
1975          }
1976      }
1977  
1978      private function inHead($token)
1979      {
1980          /* Handle the token as follows: */
1981  
1982          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1983          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1984          or U+0020 SPACE.
1985  
1986          THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1987          or script element, append the character to the current node regardless
1988          of its content. */
1989          if (($token['type'] === HTML5::CHARACTR &&
1990                  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1991                  $token['type'] === HTML5::CHARACTR && in_array(
1992                      end($this->stack)->nodeName,
1993                      array('title', 'style', 'script')
1994                  ))
1995          ) {
1996              /* Append the character to the current node. */
1997              $this->insertText($token['data']);
1998  
1999              /* A comment token */
2000          } elseif ($token['type'] === HTML5::COMMENT) {
2001              /* Append a Comment node to the current node with the data attribute
2002              set to the data given in the comment token. */
2003              $this->insertComment($token['data']);
2004  
2005          } elseif ($token['type'] === HTML5::ENDTAG &&
2006              in_array($token['name'], array('title', 'style', 'script'))
2007          ) {
2008              array_pop($this->stack);
2009              return HTML5::PCDATA;
2010  
2011              /* A start tag with the tag name "title" */
2012          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
2013              /* Create an element for the token and append the new element to the
2014              node pointed to by the head element pointer, or, if that is null
2015              (innerHTML case), to the current node. */
2016              if ($this->head_pointer !== null) {
2017                  $element = $this->insertElement($token, false);
2018                  $this->head_pointer->appendChild($element);
2019  
2020              } else {
2021                  $element = $this->insertElement($token);
2022              }
2023  
2024              /* Switch the tokeniser's content model flag  to the RCDATA state. */
2025              return HTML5::RCDATA;
2026  
2027              /* A start tag with the tag name "style" */
2028          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
2029              /* Create an element for the token and append the new element to the
2030              node pointed to by the head element pointer, or, if that is null
2031              (innerHTML case), to the current node. */
2032              if ($this->head_pointer !== null) {
2033                  $element = $this->insertElement($token, false);
2034                  $this->head_pointer->appendChild($element);
2035  
2036              } else {
2037                  $this->insertElement($token);
2038              }
2039  
2040              /* Switch the tokeniser's content model flag  to the CDATA state. */
2041              return HTML5::CDATA;
2042  
2043              /* A start tag with the tag name "script" */
2044          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
2045              /* Create an element for the token. */
2046              $element = $this->insertElement($token, false);
2047              $this->head_pointer->appendChild($element);
2048  
2049              /* Switch the tokeniser's content model flag  to the CDATA state. */
2050              return HTML5::CDATA;
2051  
2052              /* A start tag with the tag name "base", "link", or "meta" */
2053          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2054                  $token['name'],
2055                  array('base', 'link', 'meta')
2056              )
2057          ) {
2058              /* Create an element for the token and append the new element to the
2059              node pointed to by the head element pointer, or, if that is null
2060              (innerHTML case), to the current node. */
2061              if ($this->head_pointer !== null) {
2062                  $element = $this->insertElement($token, false);
2063                  $this->head_pointer->appendChild($element);
2064                  array_pop($this->stack);
2065  
2066              } else {
2067                  $this->insertElement($token);
2068              }
2069  
2070              /* An end tag with the tag name "head" */
2071          } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
2072              /* If the current node is a head element, pop the current node off
2073              the stack of open elements. */
2074              if ($this->head_pointer->isSameNode(end($this->stack))) {
2075                  array_pop($this->stack);
2076  
2077                  /* Otherwise, this is a parse error. */
2078              } else {
2079                  // k
2080              }
2081  
2082              /* Change the insertion mode to "after head". */
2083              $this->mode = self::AFTER_HEAD;
2084  
2085              /* A start tag with the tag name "head" or an end tag except "html". */
2086          } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
2087              ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')
2088          ) {
2089              // Parse error. Ignore the token.
2090  
2091              /* Anything else */
2092          } else {
2093              /* If the current node is a head element, act as if an end tag
2094              token with the tag name "head" had been seen. */
2095              if ($this->head_pointer->isSameNode(end($this->stack))) {
2096                  $this->inHead(
2097                      array(
2098                          'name' => 'head',
2099                          'type' => HTML5::ENDTAG
2100                      )
2101                  );
2102  
2103                  /* Otherwise, change the insertion mode to "after head". */
2104              } else {
2105                  $this->mode = self::AFTER_HEAD;
2106              }
2107  
2108              /* Then, reprocess the current token. */
2109              return $this->afterHead($token);
2110          }
2111      }
2112  
2113      private function afterHead($token)
2114      {
2115          /* Handle the token as follows: */
2116  
2117          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
2118          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2119          or U+0020 SPACE */
2120          if ($token['type'] === HTML5::CHARACTR &&
2121              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
2122          ) {
2123              /* Append the character to the current node. */
2124              $this->insertText($token['data']);
2125  
2126              /* A comment token */
2127          } elseif ($token['type'] === HTML5::COMMENT) {
2128              /* Append a Comment node to the current node with the data attribute
2129              set to the data given in the comment token. */
2130              $this->insertComment($token['data']);
2131  
2132              /* A start tag token with the tag name "body" */
2133          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
2134              /* Insert a body element for the token. */
2135              $this->insertElement($token);
2136  
2137              /* Change the insertion mode to "in body". */
2138              $this->mode = self::IN_BODY;
2139  
2140              /* A start tag token with the tag name "frameset" */
2141          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
2142              /* Insert a frameset element for the token. */
2143              $this->insertElement($token);
2144  
2145              /* Change the insertion mode to "in frameset". */
2146              $this->mode = self::IN_FRAME;
2147  
2148              /* A start tag token whose tag name is one of: "base", "link", "meta",
2149              "script", "style", "title" */
2150          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2151                  $token['name'],
2152                  array('base', 'link', 'meta', 'script', 'style', 'title')
2153              )
2154          ) {
2155              /* Parse error. Switch the insertion mode back to "in head" and
2156              reprocess the token. */
2157              $this->mode = self::IN_HEAD;
2158              return $this->inHead($token);
2159  
2160              /* Anything else */
2161          } else {
2162              /* Act as if a start tag token with the tag name "body" and no
2163              attributes had been seen, and then reprocess the current token. */
2164              $this->afterHead(
2165                  array(
2166                      'name' => 'body',
2167                      'type' => HTML5::STARTTAG,
2168                      'attr' => array()
2169                  )
2170              );
2171  
2172              return $this->inBody($token);
2173          }
2174      }
2175  
2176      private function inBody($token)
2177      {
2178          /* Handle the token as follows: */
2179  
2180          switch ($token['type']) {
2181              /* A character token */
2182              case HTML5::CHARACTR:
2183                  /* Reconstruct the active formatting elements, if any. */
2184                  $this->reconstructActiveFormattingElements();
2185  
2186                  /* Append the token's character to the current node. */
2187                  $this->insertText($token['data']);
2188                  break;
2189  
2190              /* A comment token */
2191              case HTML5::COMMENT:
2192                  /* Append a Comment node to the current node with the data
2193                  attribute set to the data given in the comment token. */
2194                  $this->insertComment($token['data']);
2195                  break;
2196  
2197              case HTML5::STARTTAG:
2198                  switch ($token['name']) {
2199                      /* A start tag token whose tag name is one of: "script",
2200                      "style" */
2201                      case 'script':
2202                      case 'style':
2203                          /* Process the token as if the insertion mode had been "in
2204                          head". */
2205                          return $this->inHead($token);
2206                          break;
2207  
2208                      /* A start tag token whose tag name is one of: "base", "link",
2209                      "meta", "title" */
2210                      case 'base':
2211                      case 'link':
2212                      case 'meta':
2213                      case 'title':
2214                          /* Parse error. Process the token as if the insertion mode
2215                          had    been "in head". */
2216                          return $this->inHead($token);
2217                          break;
2218  
2219                      /* A start tag token with the tag name "body" */
2220                      case 'body':
2221                          /* Parse error. If the second element on the stack of open
2222                          elements is not a body element, or, if the stack of open
2223                          elements has only one node on it, then ignore the token.
2224                          (innerHTML case) */
2225                          if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
2226                              // Ignore
2227  
2228                              /* Otherwise, for each attribute on the token, check to see
2229                              if the attribute is already present on the body element (the
2230                              second element)    on the stack of open elements. If it is not,
2231                              add the attribute and its corresponding value to that
2232                              element. */
2233                          } else {
2234                              foreach ($token['attr'] as $attr) {
2235                                  if (!$this->stack[1]->hasAttribute($attr['name'])) {
2236                                      $this->stack[1]->setAttribute($attr['name'], $attr['value']);
2237                                  }
2238                              }
2239                          }
2240                          break;
2241  
2242                      /* A start tag whose tag name is one of: "address",
2243                      "blockquote", "center", "dir", "div", "dl", "fieldset",
2244                      "listing", "menu", "ol", "p", "ul" */
2245                      case 'address':
2246                      case 'blockquote':
2247                      case 'center':
2248                      case 'dir':
2249                      case 'div':
2250                      case 'dl':
2251                      case 'fieldset':
2252                      case 'listing':
2253                      case 'menu':
2254                      case 'ol':
2255                      case 'p':
2256                      case 'ul':
2257                          /* If the stack of open elements has a p element in scope,
2258                          then act as if an end tag with the tag name p had been
2259                          seen. */
2260                          if ($this->elementInScope('p')) {
2261                              $this->emitToken(
2262                                  array(
2263                                      'name' => 'p',
2264                                      'type' => HTML5::ENDTAG
2265                                  )
2266                              );
2267                          }
2268  
2269                          /* Insert an HTML element for the token. */
2270                          $this->insertElement($token);
2271                          break;
2272  
2273                      /* A start tag whose tag name is "form" */
2274                      case 'form':
2275                          /* If the form element pointer is not null, ignore the
2276                          token with a parse error. */
2277                          if ($this->form_pointer !== null) {
2278                              // Ignore.
2279  
2280                              /* Otherwise: */
2281                          } else {
2282                              /* If the stack of open elements has a p element in
2283                              scope, then act as if an end tag with the tag name p
2284                              had been seen. */
2285                              if ($this->elementInScope('p')) {
2286                                  $this->emitToken(
2287                                      array(
2288                                          'name' => 'p',
2289                                          'type' => HTML5::ENDTAG
2290                                      )
2291                                  );
2292                              }
2293  
2294                              /* Insert an HTML element for the token, and set the
2295                              form element pointer to point to the element created. */
2296                              $element = $this->insertElement($token);
2297                              $this->form_pointer = $element;
2298                          }
2299                          break;
2300  
2301                      /* A start tag whose tag name is "li", "dd" or "dt" */
2302                      case 'li':
2303                      case 'dd':
2304                      case 'dt':
2305                          /* If the stack of open elements has a p  element in scope,
2306                          then act as if an end tag with the tag name p had been
2307                          seen. */
2308                          if ($this->elementInScope('p')) {
2309                              $this->emitToken(
2310                                  array(
2311                                      'name' => 'p',
2312                                      'type' => HTML5::ENDTAG
2313                                  )
2314                              );
2315                          }
2316  
2317                          $stack_length = count($this->stack) - 1;
2318  
2319                          for ($n = $stack_length; 0 <= $n; $n--) {
2320                              /* 1. Initialise node to be the current node (the
2321                              bottommost node of the stack). */
2322                              $stop = false;
2323                              $node = $this->stack[$n];
2324                              $cat = $this->getElementCategory($node->tagName);
2325  
2326                              /* 2. If node is an li, dd or dt element, then pop all
2327                              the    nodes from the current node up to node, including
2328                              node, then stop this algorithm. */
2329                              if ($token['name'] === $node->tagName || ($token['name'] !== 'li'
2330                                      && ($node->tagName === 'dd' || $node->tagName === 'dt'))
2331                              ) {
2332                                  for ($x = $stack_length; $x >= $n; $x--) {
2333                                      array_pop($this->stack);
2334                                  }
2335  
2336                                  break;
2337                              }
2338  
2339                              /* 3. If node is not in the formatting category, and is
2340                              not    in the phrasing category, and is not an address or
2341                              div element, then stop this algorithm. */
2342                              if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
2343                                  $node->tagName !== 'address' && $node->tagName !== 'div'
2344                              ) {
2345                                  break;
2346                              }
2347                          }
2348  
2349                          /* Finally, insert an HTML element with the same tag
2350                          name as the    token's. */
2351                          $this->insertElement($token);
2352                          break;
2353  
2354                      /* A start tag token whose tag name is "plaintext" */
2355                      case 'plaintext':
2356                          /* If the stack of open elements has a p  element in scope,
2357                          then act as if an end tag with the tag name p had been
2358                          seen. */
2359                          if ($this->elementInScope('p')) {
2360                              $this->emitToken(
2361                                  array(
2362                                      'name' => 'p',
2363                                      'type' => HTML5::ENDTAG
2364                                  )
2365                              );
2366                          }
2367  
2368                          /* Insert an HTML element for the token. */
2369                          $this->insertElement($token);
2370  
2371                          return HTML5::PLAINTEXT;
2372                          break;
2373  
2374                      /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
2375                      "h5", "h6" */
2376                      case 'h1':
2377                      case 'h2':
2378                      case 'h3':
2379                      case 'h4':
2380                      case 'h5':
2381                      case 'h6':
2382                          /* If the stack of open elements has a p  element in scope,
2383                          then act as if an end tag with the tag name p had been seen. */
2384                          if ($this->elementInScope('p')) {
2385                              $this->emitToken(
2386                                  array(
2387                                      'name' => 'p',
2388                                      'type' => HTML5::ENDTAG
2389                                  )
2390                              );
2391                          }
2392  
2393                          /* If the stack of open elements has in scope an element whose
2394                          tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2395                          this is a parse error; pop elements from the stack until an
2396                          element with one of those tag names has been popped from the
2397                          stack. */
2398                          while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
2399                              array_pop($this->stack);
2400                          }
2401  
2402                          /* Insert an HTML element for the token. */
2403                          $this->insertElement($token);
2404                          break;
2405  
2406                      /* A start tag whose tag name is "a" */
2407                      case 'a':
2408                          /* If the list of active formatting elements contains
2409                          an element whose tag name is "a" between the end of the
2410                          list and the last marker on the list (or the start of
2411                          the list if there is no marker on the list), then this
2412                          is a parse error; act as if an end tag with the tag name
2413                          "a" had been seen, then remove that element from the list
2414                          of active formatting elements and the stack of open
2415                          elements if the end tag didn't already remove it (it
2416                          might not have if the element is not in table scope). */
2417                          $leng = count($this->a_formatting);
2418  
2419                          for ($n = $leng - 1; $n >= 0; $n--) {
2420                              if ($this->a_formatting[$n] === self::MARKER) {
2421                                  break;
2422  
2423                              } elseif ($this->a_formatting[$n]->nodeName === 'a') {
2424                                  $this->emitToken(
2425                                      array(
2426                                          'name' => 'a',
2427                                          'type' => HTML5::ENDTAG
2428                                      )
2429                                  );
2430                                  break;
2431                              }
2432                          }
2433  
2434                          /* Reconstruct the active formatting elements, if any. */
2435                          $this->reconstructActiveFormattingElements();
2436  
2437                          /* Insert an HTML element for the token. */
2438                          $el = $this->insertElement($token);
2439  
2440                          /* Add that element to the list of active formatting
2441                          elements. */
2442                          $this->a_formatting[] = $el;
2443                          break;
2444  
2445                      /* A start tag whose tag name is one of: "b", "big", "em", "font",
2446                      "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2447                      case 'b':
2448                      case 'big':
2449                      case 'em':
2450                      case 'font':
2451                      case 'i':
2452                      case 'nobr':
2453                      case 's':
2454                      case 'small':
2455                      case 'strike':
2456                      case 'strong':
2457                      case 'tt':
2458                      case 'u':
2459                          /* Reconstruct the active formatting elements, if any. */
2460                          $this->reconstructActiveFormattingElements();
2461  
2462                          /* Insert an HTML element for the token. */
2463                          $el = $this->insertElement($token);
2464  
2465                          /* Add that element to the list of active formatting
2466                          elements. */
2467                          $this->a_formatting[] = $el;
2468                          break;
2469  
2470                      /* A start tag token whose tag name is "button" */
2471                      case 'button':
2472                          /* If the stack of open elements has a button element in scope,
2473                          then this is a parse error; act as if an end tag with the tag
2474                          name "button" had been seen, then reprocess the token. (We don't
2475                          do that. Unnecessary.) */
2476                          if ($this->elementInScope('button')) {
2477                              $this->inBody(
2478                                  array(
2479                                      'name' => 'button',
2480                                      'type' => HTML5::ENDTAG
2481                                  )
2482                              );
2483                          }
2484  
2485                          /* Reconstruct the active formatting elements, if any. */
2486                          $this->reconstructActiveFormattingElements();
2487  
2488                          /* Insert an HTML element for the token. */
2489                          $this->insertElement($token);
2490  
2491                          /* Insert a marker at the end of the list of active
2492                          formatting elements. */
2493                          $this->a_formatting[] = self::MARKER;
2494                          break;
2495  
2496                      /* A start tag token whose tag name is one of: "marquee", "object" */
2497                      case 'marquee':
2498                      case 'object':
2499                          /* Reconstruct the active formatting elements, if any. */
2500                          $this->reconstructActiveFormattingElements();
2501  
2502                          /* Insert an HTML element for the token. */
2503                          $this->insertElement($token);
2504  
2505                          /* Insert a marker at the end of the list of active
2506                          formatting elements. */
2507                          $this->a_formatting[] = self::MARKER;
2508                          break;
2509  
2510                      /* A start tag token whose tag name is "xmp" */
2511                      case 'xmp':
2512                          /* Reconstruct the active formatting elements, if any. */
2513                          $this->reconstructActiveFormattingElements();
2514  
2515                          /* Insert an HTML element for the token. */
2516                          $this->insertElement($token);
2517  
2518                          /* Switch the content model flag to the CDATA state. */
2519                          return HTML5::CDATA;
2520                          break;
2521  
2522                      /* A start tag whose tag name is "table" */
2523                      case 'table':
2524                          /* If the stack of open elements has a p element in scope,
2525                          then act as if an end tag with the tag name p had been seen. */
2526                          if ($this->elementInScope('p')) {
2527                              $this->emitToken(
2528                                  array(
2529                                      'name' => 'p',
2530                                      'type' => HTML5::ENDTAG
2531                                  )
2532                              );
2533                          }
2534  
2535                          /* Insert an HTML element for the token. */
2536                          $this->insertElement($token);
2537  
2538                          /* Change the insertion mode to "in table". */
2539                          $this->mode = self::IN_TABLE;
2540                          break;
2541  
2542                      /* A start tag whose tag name is one of: "area", "basefont",
2543                      "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
2544                      case 'area':
2545                      case 'basefont':
2546                      case 'bgsound':
2547                      case 'br':
2548                      case 'embed':
2549                      case 'img':
2550                      case 'param':
2551                      case 'spacer':
2552                      case 'wbr':
2553                          /* Reconstruct the active formatting elements, if any. */
2554                          $this->reconstructActiveFormattingElements();
2555  
2556                          /* Insert an HTML element for the token. */
2557                          $this->insertElement($token);
2558  
2559                          /* Immediately pop the current node off the stack of open elements. */
2560                          array_pop($this->stack);
2561                          break;
2562  
2563                      /* A start tag whose tag name is "hr" */
2564                      case 'hr':
2565                          /* If the stack of open elements has a p element in scope,
2566                          then act as if an end tag with the tag name p had been seen. */
2567                          if ($this->elementInScope('p')) {
2568                              $this->emitToken(
2569                                  array(
2570                                      'name' => 'p',
2571                                      'type' => HTML5::ENDTAG
2572                                  )
2573                              );
2574                          }
2575  
2576                          /* Insert an HTML element for the token. */
2577                          $this->insertElement($token);
2578  
2579                          /* Immediately pop the current node off the stack of open elements. */
2580                          array_pop($this->stack);
2581                          break;
2582  
2583                      /* A start tag whose tag name is "image" */
2584                      case 'image':
2585                          /* Parse error. Change the token's tag name to "img" and
2586                          reprocess it. (Don't ask.) */
2587                          $token['name'] = 'img';
2588                          return $this->inBody($token);
2589                          break;
2590  
2591                      /* A start tag whose tag name is "input" */
2592                      case 'input':
2593                          /* Reconstruct the active formatting elements, if any. */
2594                          $this->reconstructActiveFormattingElements();
2595  
2596                          /* Insert an input element for the token. */
2597                          $element = $this->insertElement($token, false);
2598  
2599                          /* If the form element pointer is not null, then associate the
2600                          input element with the form element pointed to by the form
2601                          element pointer. */
2602                          $this->form_pointer !== null
2603                              ? $this->form_pointer->appendChild($element)
2604                              : end($this->stack)->appendChild($element);
2605  
2606                          /* Pop that input element off the stack of open elements. */
2607                          array_pop($this->stack);
2608                          break;
2609  
2610                      /* A start tag whose tag name is "isindex" */
2611                      case 'isindex':
2612                          /* Parse error. */
2613                          // w/e
2614  
2615                          /* If the form element pointer is not null,
2616                          then ignore the token. */
2617                          if ($this->form_pointer === null) {
2618                              /* Act as if a start tag token with the tag name "form" had
2619                              been seen. NOTE(review): the token built below is named 'body', not 'form' - confirm. */
2620                              $this->inBody(
2621                                  array(
2622                                      'name' => 'body',
2623                                      'type' => HTML5::STARTTAG,
2624                                      'attr' => array()
2625                                  )
2626                              );
2627  
2628                              /* Act as if a start tag token with the tag name "hr" had
2629                              been seen. */
2630                              $this->inBody(
2631                                  array(
2632                                      'name' => 'hr',
2633                                      'type' => HTML5::STARTTAG,
2634                                      'attr' => array()
2635                                  )
2636                              );
2637  
2638                              /* Act as if a start tag token with the tag name "p" had
2639                              been seen. */
2640                              $this->inBody(
2641                                  array(
2642                                      'name' => 'p',
2643                                      'type' => HTML5::STARTTAG,
2644                                      'attr' => array()
2645                                  )
2646                              );
2647  
2648                              /* Act as if a start tag token with the tag name "label"
2649                              had been seen. */
2650                              $this->inBody(
2651                                  array(
2652                                      'name' => 'label',
2653                                      'type' => HTML5::STARTTAG,
2654                                      'attr' => array()
2655                                  )
2656                              );
2657  
2658                              /* Act as if a stream of character tokens had been seen. */
2659                              $this->insertText(
2660                                  'This is a searchable index. ' .
2661                                  'Insert your search keywords here: '
2662                              );
2663  
2664                              /* Act as if a start tag token with the tag name "input"
2665                              had been seen, with all the attributes from the "isindex"
2666                              token, except with the "name" attribute set to the value
2667                              "isindex" (ignoring any explicit "name" attribute). */
2668                              $attr = $token['attr'];
2669                              $attr[] = array('name' => 'name', 'value' => 'isindex');
2670  
2671                              $this->inBody(
2672                                  array(
2673                                      'name' => 'input',
2674                                      'type' => HTML5::STARTTAG,
2675                                      'attr' => $attr
2676                                  )
2677                              );
2678  
2679                              /* Act as if a stream of character tokens had been seen
2680                              (see below for what they should say). */
2681                              $this->insertText(
2682                                  'This is a searchable index. ' .
2683                                  'Insert your search keywords here: '
2684                              );
2685  
2686                              /* Act as if an end tag token with the tag name "label"
2687                              had been seen. */
2688                              $this->inBody(
2689                                  array(
2690                                      'name' => 'label',
2691                                      'type' => HTML5::ENDTAG
2692                                  )
2693                              );
2694  
2695                              /* Act as if an end tag token with the tag name "p" had
2696                              been seen. */
2697                              $this->inBody(
2698                                  array(
2699                                      'name' => 'p',
2700                                      'type' => HTML5::ENDTAG
2701                                  )
2702                              );
2703  
2704                              /* Act as if a start tag token with the tag name "hr" had
2705                              been seen. NOTE(review): the token built below has type ENDTAG, not STARTTAG - confirm. */
2706                              $this->inBody(
2707                                  array(
2708                                      'name' => 'hr',
2709                                      'type' => HTML5::ENDTAG
2710                                  )
2711                              );
2712  
2713                              /* Act as if an end tag token with the tag name "form" had
2714                              been seen. */
2715                              $this->inBody(
2716                                  array(
2717                                      'name' => 'form',
2718                                      'type' => HTML5::ENDTAG
2719                                  )
2720                              );
2721                          }
2722                          break;
2723  
2724                      /* A start tag whose tag name is "textarea" */
2725                      case 'textarea':
2726                          $this->insertElement($token);
2727  
2728                          /* Switch the tokeniser's content model flag to the
2729                          RCDATA state. */
2730                          return HTML5::RCDATA;
2731                          break;
2732  
2733                      /* A start tag whose tag name is one of: "iframe", "noembed",
2734                      "noframes" */
2735                      case 'iframe':
2736                      case 'noembed':
2737                      case 'noframes':
2738                          $this->insertElement($token);
2739  
2740                          /* Switch the tokeniser's content model flag to the CDATA state. */
2741                          return HTML5::CDATA;
2742                          break;
2743  
2744                      /* A start tag whose tag name is "select" */
2745                      case 'select':
2746                          /* Reconstruct the active formatting elements, if any. */
2747                          $this->reconstructActiveFormattingElements();
2748  
2749                          /* Insert an HTML element for the token. */
2750                          $this->insertElement($token);
2751  
2752                          /* Change the insertion mode to "in select". */
2753                          $this->mode = self::IN_SELECT;
2754                          break;
2755  
2756                      /* A start or end tag whose tag name is one of: "caption", "col",
2757                      "colgroup", "frame", "frameset", "head", "option", "optgroup",
2758                      "tbody", "td", "tfoot", "th", "thead", "tr". */
2759                      case 'caption':
2760                      case 'col':
2761                      case 'colgroup':
2762                      case 'frame':
2763                      case 'frameset':
2764                      case 'head':
2765                      case 'option':
2766                      case 'optgroup':
2767                      case 'tbody':
2768                      case 'td':
2769                      case 'tfoot':
2770                      case 'th':
2771                      case 'thead':
2772                      case 'tr':
2773                          // Parse error. Ignore the token.
2774                          break;
2775  
2776                      /* A start or end tag whose tag name is one of: "event-source",
2777                      "section", "nav", "article", "aside", "header", "footer",
2778                      "datagrid", "command" */
2779                      case 'event-source':
2780                      case 'section':
2781                      case 'nav':
2782                      case 'article':
2783                      case 'aside':
2784                      case 'header':
2785                      case 'footer':
2786                      case 'datagrid':
2787                      case 'command':
2788                          // Work in progress!
2789                          break;
2790  
2791                      /* A start tag token not covered by the previous entries */
2792                      default:
2793                          /* Reconstruct the active formatting elements, if any. */
2794                          $this->reconstructActiveFormattingElements();
2795  
2796                          $this->insertElement($token, true, true);
2797                          break;
2798                  }
2799                  break;
2800  
2801              case HTML5::ENDTAG:
2802                  switch ($token['name']) {
2803                      /* An end tag with the tag name "body" */
2804                      case 'body':
2805                          /* If the second element in the stack of open elements is
2806                          not a body element, this is a parse error. Ignore the token.
2807                          (innerHTML case) */
2808                          if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2809                              // Ignore.
2810  
2811                              /* If the current node is not the body element, then this
2812                              is a parse error. */
2813                          } elseif (end($this->stack)->nodeName !== 'body') {
2814                              // Parse error.
2815                          }
2816  
2817                          /* Change the insertion mode to "after body". */
2818                          $this->mode = self::AFTER_BODY;
2819                          break;
2820  
2821                      /* An end tag with the tag name "html" */
2822                      case 'html':
2823                          /* Act as if an end tag with tag name "body" had been seen,
2824                          then, if that token wasn't ignored, reprocess the current
2825                          token. */
2826                          $this->inBody(
2827                              array(
2828                                  'name' => 'body',
2829                                  'type' => HTML5::ENDTAG
2830                              )
2831                          );
2832  
2833                          return $this->afterBody($token);
2834                          break;
2835  
2836                      /* An end tag whose tag name is one of: "address", "blockquote",
2837                      "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2838                      "ol", "pre", "ul" */
2839                      case 'address':
2840                      case 'blockquote':
2841                      case 'center':
2842                      case 'dir':
2843                      case 'div':
2844                      case 'dl':
2845                      case 'fieldset':
2846                      case 'listing':
2847                      case 'menu':
2848                      case 'ol':
2849                      case 'pre':
2850                      case 'ul':
2851                          /* If the stack of open elements has an element in scope
2852                          with the same tag name as that of the token, then generate
2853                          implied end tags. */
2854                          if ($this->elementInScope($token['name'])) {
2855                              $this->generateImpliedEndTags();
2856  
2857                              /* Now, if the current node is not an element with
2858                              the same tag name as that of the token, then this
2859                              is a parse error. */
2860                              // w/e
2861  
2862                              /* If the stack of open elements has an element in
2863                              scope with the same tag name as that of the token,
2864                              then pop elements from this stack until an element
2865                              with that tag name has been popped from the stack. */
2866                              for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2867                                  if ($this->stack[$n]->nodeName === $token['name']) {
2868                                      $n = -1;
2869                                  }
2870  
2871                                  array_pop($this->stack);
2872                              }
2873                          }
2874                          break;
2875  
2876                      /* An end tag whose tag name is "form" */
2877                      case 'form':
2878                          /* If the stack of open elements has an element in scope
2879                          with the same tag name as that of the token, then generate
2880                          implied end tags. */
2881                          if ($this->elementInScope($token['name'])) {
2882                              $this->generateImpliedEndTags();
2883  
2884                          }
2885  
2886                          if (end($this->stack)->nodeName !== $token['name']) {
2887                              /* Now, if the current node is not an element with the
2888                              same tag name as that of the token, then this is a parse
2889                              error. */
2890                              // w/e
2891  
2892                          } else {
2893                              /* Otherwise, if the current node is an element with
2894                              the same tag name as that of the token pop that element
2895                              from the stack. */
2896                              array_pop($this->stack);
2897                          }
2898  
2899                          /* In any case, set the form element pointer to null. */
2900                          $this->form_pointer = null;
2901                          break;
2902  
2903                      /* An end tag whose tag name is "p" */
2904                      case 'p':
2905                          /* If the stack of open elements has a p element in scope,
2906                          then generate implied end tags, except for p elements. */
2907                          if ($this->elementInScope('p')) {
2908                              $this->generateImpliedEndTags(array('p'));
2909  
2910                              /* If the current node is not a p element, then this is
2911                              a parse error. */
2912                              // k
2913  
2914                              /* If the stack of open elements has a p element in
2915                              scope, then pop elements from this stack until the stack
2916                              no longer has a p element in scope. */
2917                              for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2918                                  if ($this->elementInScope('p')) {
2919                                      array_pop($this->stack);
2920  
2921                                  } else {
2922                                      break;
2923                                  }
2924                              }
2925                          }
2926                          break;
2927  
2928                      /* An end tag whose tag name is "dd", "dt", or "li" */
2929                      case 'dd':
2930                      case 'dt':
2931                      case 'li':
2932                          /* If the stack of open elements has an element in scope
2933                          whose tag name matches the tag name of the token, then
2934                          generate implied end tags, except for elements with the
2935                          same tag name as the token. */
2936                          if ($this->elementInScope($token['name'])) {
2937                              $this->generateImpliedEndTags(array($token['name']));
2938  
2939                              /* If the current node is not an element with the same
2940                              tag name as the token, then this is a parse error. */
2941                              // w/e
2942  
2943                              /* If the stack of open elements has an element in scope
2944                              whose tag name matches the tag name of the token, then
2945                              pop elements from this stack until an element with that
2946                              tag name has been popped from the stack. */
2947                              for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2948                                  if ($this->stack[$n]->nodeName === $token['name']) {
2949                                      $n = -1;
2950                                  }
2951  
2952                                  array_pop($this->stack);
2953                              }
2954                          }
2955                          break;
2956  
2957                      /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2958                      "h5", "h6" */
2959                      case 'h1':
2960                      case 'h2':
2961                      case 'h3':
2962                      case 'h4':
2963                      case 'h5':
2964                      case 'h6':
2965                          $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2966  
2967                          /* If the stack of open elements has in scope an element whose
2968                          tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2969                          generate implied end tags. */
2970                          if ($this->elementInScope($elements)) {
2971                              $this->generateImpliedEndTags();
2972  
2973                              /* Now, if the current node is not an element with the same
2974                              tag name as that of the token, then this is a parse error. */
2975                              // w/e
2976  
2977                              /* If the stack of open elements has in scope an element
2978                              whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2979                              "h6", then pop elements from the stack until an element
2980                              with one of those tag names has been popped from the stack. */
2981                              while ($this->elementInScope($elements)) {
2982                                  array_pop($this->stack);
2983                              }
2984                          }
2985                          break;
2986  
2987                      /* An end tag whose tag name is one of: "a", "b", "big", "em",
2988                      "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2989                      case 'a':
2990                      case 'b':
2991                      case 'big':
2992                      case 'em':
2993                      case 'font':
2994                      case 'i':
2995                      case 'nobr':
2996                      case 's':
2997                      case 'small':
2998                      case 'strike':
2999                      case 'strong':
3000                      case 'tt':
3001                      case 'u':
3002                          /* 1. Let the formatting element be the last element in
3003                          the list of active formatting elements that:
3004                              * is between the end of the list and the last scope
3005                              marker in the list, if any, or the start of the list
3006                              otherwise, and
3007                              * has the same tag name as the token.
3008                          */
3009                          while (true) {
3010                              for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
3011                                  if ($this->a_formatting[$a] === self::MARKER) {
3012                                      break;
3013  
3014                                  } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
3015                                      $formatting_element = $this->a_formatting[$a];
3016                                      $in_stack = in_array($formatting_element, $this->stack, true);
3017                                      $fe_af_pos = $a;
3018                                      break;
3019                                  }
3020                              }
3021  
3022                              /* If there is no such node, or, if that node is
3023                              also in the stack of open elements but the element
3024                              is not in scope, then this is a parse error. Abort
3025                              these steps. The token is ignored. */
3026                              if (!isset($formatting_element) || ($in_stack &&
3027                                      !$this->elementInScope($token['name']))
3028                              ) {
3029                                  break;
3030  
3031                                  /* Otherwise, if there is such a node, but that node
3032                                  is not in the stack of open elements, then this is a
3033                                  parse error; remove the element from the list, and
3034                                  abort these steps. */
3035                              } elseif (isset($formatting_element) && !$in_stack) {
3036                                  unset($this->a_formatting[$fe_af_pos]);
3037                                  $this->a_formatting = array_merge($this->a_formatting);
3038                                  break;
3039                              }
3040  
3041                              /* 2. Let the furthest block be the topmost node in the
3042                              stack of open elements that is lower in the stack
3043                              than the formatting element, and is not an element in
3044                              the phrasing or formatting categories. There might
3045                              not be one. */
3046                              $fe_s_pos = array_search($formatting_element, $this->stack, true);
3047                              $length = count($this->stack);
3048  
3049                              for ($s = $fe_s_pos + 1; $s < $length; $s++) {
3050                                  $category = $this->getElementCategory($this->stack[$s]->nodeName);
3051  
3052                                  if ($category !== self::PHRASING && $category !== self::FORMATTING) {
3053                                      $furthest_block = $this->stack[$s];
3054                                  }
3055                              }
3056  
3057                              /* 3. If there is no furthest block, then the UA must
3058                              skip the subsequent steps and instead just pop all
3059                              the nodes from the bottom of the stack of open
3060                              elements, from the current node up to the formatting
3061                              element, and remove the formatting element from the
3062                              list of active formatting elements. */
3063                              if (!isset($furthest_block)) {
3064                                  for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
3065                                      array_pop($this->stack);
3066                                  }
3067  
3068                                  unset($this->a_formatting[$fe_af_pos]);
3069                                  $this->a_formatting = array_merge($this->a_formatting);
3070                                  break;
3071                              }
3072  
3073                              /* 4. Let the common ancestor be the element
3074                              immediately above the formatting element in the stack
3075                              of open elements. */
3076                              $common_ancestor = $this->stack[$fe_s_pos - 1];
3077  
3078                              /* 5. If the furthest block has a parent node, then
3079                              remove the furthest block from its parent node. */
3080                              if ($furthest_block->parentNode !== null) {
3081                                  $furthest_block->parentNode->removeChild($furthest_block);
3082                              }
3083  
3084                              /* 6. Let a bookmark note the position of the
3085                              formatting element in the list of active formatting
3086                              elements relative to the elements on either side
3087                              of it in the list. */
3088                              $bookmark = $fe_af_pos;
3089  
3090                              /* 7. Let node and last node  be the furthest block.
3091                              Follow these steps: */
3092                              $node = $furthest_block;
3093                              $last_node = $furthest_block;
3094  
3095                              while (true) {
3096                                  for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
3097                                      /* 7.1 Let node be the element immediately
3098                                      prior to node in the stack of open elements. */
3099                                      $node = $this->stack[$n];
3100  
3101                                      /* 7.2 If node is not in the list of active
3102                                      formatting elements, then remove node from
3103                                      the stack of open elements and then go back
3104                                      to step 1. */
3105                                      if (!in_array($node, $this->a_formatting, true)) {
3106                                          unset($this->stack[$n]);
3107                                          $this->stack = array_merge($this->stack);
3108  
3109                                      } else {
3110                                          break;
3111                                      }
3112                                  }
3113  
3114                                  /* 7.3 Otherwise, if node is the formatting
3115                                  element, then go to the next step in the overall
3116                                  algorithm. */
3117                                  if ($node === $formatting_element) {
3118                                      break;
3119  
3120                                      /* 7.4 Otherwise, if last node is the furthest
3121                                      block, then move the aforementioned bookmark to
3122                                      be immediately after the node in the list of
3123                                      active formatting elements. */
3124                                  } elseif ($last_node === $furthest_block) {
3125                                      $bookmark = array_search($node, $this->a_formatting, true) + 1;
3126                                  }
3127  
3128                                  /* 7.5 If node has any children, perform a
3129                                  shallow clone of node, replace the entry for
3130                                  node in the list of active formatting elements
3131                                  with an entry for the clone, replace the entry
3132                                  for node in the stack of open elements with an
3133                                  entry for the clone, and let node be the clone. */
3134                                  if ($node->hasChildNodes()) {
3135                                      $clone = $node->cloneNode();
3136                                      $s_pos = array_search($node, $this->stack, true);
3137                                      $a_pos = array_search($node, $this->a_formatting, true);
3138  
3139                                      $this->stack[$s_pos] = $clone;
3140                                      $this->a_formatting[$a_pos] = $clone;
3141                                      $node = $clone;
3142                                  }
3143  
3144                                  /* 7.6 Insert last node into node, first removing
3145                                  it from its previous parent node if any. */
3146                                  if ($last_node->parentNode !== null) {
3147                                      $last_node->parentNode->removeChild($last_node);
3148                                  }
3149  
3150                                  $node->appendChild($last_node);
3151  
3152                                  /* 7.7 Let last node be node. */
3153                                  $last_node = $node;
3154                              }
3155  
3156                              /* 8. Insert whatever last node ended up being in
3157                              the previous step into the common ancestor node,
3158                              first removing it from its previous parent node if
3159                              any. */
3160                              if ($last_node->parentNode !== null) {
3161                                  $last_node->parentNode->removeChild($last_node);
3162                              }
3163  
3164                              $common_ancestor->appendChild($last_node);
3165  
3166                              /* 9. Perform a shallow clone of the formatting
3167                              element. */
3168                              $clone = $formatting_element->cloneNode();
3169  
3170                              /* 10. Take all of the child nodes of the furthest
3171                              block and append them to the clone created in the
3172                              last step. */
3173                              while ($furthest_block->hasChildNodes()) {
3174                                  $child = $furthest_block->firstChild;
3175                                  $furthest_block->removeChild($child);
3176                                  $clone->appendChild($child);
3177                              }
3178  
3179                              /* 11. Append that clone to the furthest block. */
3180                              $furthest_block->appendChild($clone);
3181  
3182                              /* 12. Remove the formatting element from the list
3183                              of active formatting elements, and insert the clone
3184                              into the list of active formatting elements at the
3185                              position of the aforementioned bookmark. */
3186                              $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
3187                              unset($this->a_formatting[$fe_af_pos]);
3188                              $this->a_formatting = array_merge($this->a_formatting);
3189  
3190                              $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
3191                              $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
3192                              $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
3193  
3194                              /* 13. Remove the formatting element from the stack
3195                              of open elements, and insert the clone into the stack
3196                              of open elements immediately after (i.e. in a more
3197                              deeply nested position than) the position of the
3198                              furthest block in that stack. */
3199                              $fe_s_pos = array_search($formatting_element, $this->stack, true);
3200                              $fb_s_pos = array_search($furthest_block, $this->stack, true);
3201                              unset($this->stack[$fe_s_pos]);
3202  
3203                              $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
3204                              $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
3205                              $this->stack = array_merge($s_part1, array($clone), $s_part2);
3206  
3207                              /* 14. Jump back to step 1 in this series of steps. */
3208                              unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
3209                          }
3210                          break;
3211  
3212                      /* An end tag token whose tag name is one of: "button",
3213                      "marquee", "object" */
3214                      case 'button':
3215                      case 'marquee':
3216                      case 'object':
3217                          /* If the stack of open elements has an element in scope whose
3218                          tag name matches the tag name of the token, then generate implied
3219                          tags. */
3220                          if ($this->elementInScope($token['name'])) {
3221                              $this->generateImpliedEndTags();
3222  
3223                              /* Now, if the current node is not an element with the same
3224                              tag name as the token, then this is a parse error. */
3225                              // k
3226  
3227                              /* Now, if the stack of open elements has an element in scope
3228                              whose tag name matches the tag name of the token, then pop
3229                              elements from the stack until that element has been popped from
3230                              the stack, and clear the list of active formatting elements up
3231                              to the last marker. */
3232                              for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3233                                  if ($this->stack[$n]->nodeName === $token['name']) {
3234                                      $n = -1;
3235                                  }
3236  
3237                                  array_pop($this->stack);
3238                              }
3239  
3240                              $marker = end(array_keys($this->a_formatting, self::MARKER, true));
3241  
3242                              for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
3243                                  array_pop($this->a_formatting);
3244                              }
3245                          }
3246                          break;
3247  
3248                      /* Or an end tag whose tag name is one of: "area", "basefont",
3249                      "bgsound", "br", "embed", "hr", "iframe", "image", "img",
3250                      "input", "isindex", "noembed", "noframes", "param", "select",
3251                      "spacer", "table", "textarea", "wbr" */
3252                      case 'area':
3253                      case 'basefont':
3254                      case 'bgsound':
3255                      case 'br':
3256                      case 'embed':
3257                      case 'hr':
3258                      case 'iframe':
3259                      case 'image':
3260                      case 'img':
3261                      case 'input':
3262                      case 'isindex':
3263                      case 'noembed':
3264                      case 'noframes':
3265                      case 'param':
3266                      case 'select':
3267                      case 'spacer':
3268                      case 'table':
3269                      case 'textarea':
3270                      case 'wbr':
3271                          // Parse error. Ignore the token.
3272                          break;
3273  
3274                      /* An end tag token not covered by the previous entries */
3275                      default:
3276                          for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3277                              /* Initialise node to be the current node (the bottommost
3278                              node of the stack). */
3279                              $node = end($this->stack);
3280  
3281                              /* If node has the same tag name as the end tag token,
3282                              then: */
3283                              if ($token['name'] === $node->nodeName) {
3284                                  /* Generate implied end tags. */
3285                                  $this->generateImpliedEndTags();
3286  
3287                                  /* If the tag name of the end tag token does not
3288                                  match the tag name of the current node, this is a
3289                                  parse error. */
3290                                  // k
3291  
3292                                  /* Pop all the nodes from the current node up to
3293                                  node, including node, then stop this algorithm. */
3294                                  for ($x = count($this->stack) - $n; $x >= $n; $x--) {
3295                                      array_pop($this->stack);
3296                                  }
3297  
3298                              } else {
3299                                  $category = $this->getElementCategory($node);
3300  
3301                                  if ($category !== self::SPECIAL && $category !== self::SCOPING) {
3302                                      /* Otherwise, if node is in neither the formatting
3303                                      category nor the phrasing category, then this is a
3304                                      parse error. Stop this algorithm. The end tag token
3305                                      is ignored. */
3306                                      return false;
3307                                  }
3308                              }
3309                          }
3310                          break;
3311                  }
3312                  break;
3313          }
3314      }
3315  
3316      private function inTable($token)
3317      {
3318          $clear = array('html', 'table');
3319  
3320          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3321          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3322          or U+0020 SPACE */
3323          if ($token['type'] === HTML5::CHARACTR &&
3324              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
3325          ) {
3326              /* Append the character to the current node. */
3327              $text = $this->dom->createTextNode($token['data']);
3328              end($this->stack)->appendChild($text);
3329  
3330              /* A comment token */
3331          } elseif ($token['type'] === HTML5::COMMENT) {
3332              /* Append a Comment node to the current node with the data
3333              attribute set to the data given in the comment token. */
3334              $comment = $this->dom->createComment($token['data']);
3335              end($this->stack)->appendChild($comment);
3336  
3337              /* A start tag whose tag name is "caption" */
3338          } elseif ($token['type'] === HTML5::STARTTAG &&
3339              $token['name'] === 'caption'
3340          ) {
3341              /* Clear the stack back to a table context. */
3342              $this->clearStackToTableContext($clear);
3343  
3344              /* Insert a marker at the end of the list of active
3345              formatting elements. */
3346              $this->a_formatting[] = self::MARKER;
3347  
3348              /* Insert an HTML element for the token, then switch the
3349              insertion mode to "in caption". */
3350              $this->insertElement($token);
3351              $this->mode = self::IN_CAPTION;
3352  
3353              /* A start tag whose tag name is "colgroup" */
3354          } elseif ($token['type'] === HTML5::STARTTAG &&
3355              $token['name'] === 'colgroup'
3356          ) {
3357              /* Clear the stack back to a table context. */
3358              $this->clearStackToTableContext($clear);
3359  
3360              /* Insert an HTML element for the token, then switch the
3361              insertion mode to "in column group". */
3362              $this->insertElement($token);
3363              $this->mode = self::IN_CGROUP;
3364  
3365              /* A start tag whose tag name is "col" */
3366          } elseif ($token['type'] === HTML5::STARTTAG &&
3367              $token['name'] === 'col'
3368          ) {
3369              $this->inTable(
3370                  array(
3371                      'name' => 'colgroup',
3372                      'type' => HTML5::STARTTAG,
3373                      'attr' => array()
3374                  )
3375              );
3376  
3377              $this->inColumnGroup($token);
3378  
3379              /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
3380          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3381                  $token['name'],
3382                  array('tbody', 'tfoot', 'thead')
3383              )
3384          ) {
3385              /* Clear the stack back to a table context. */
3386              $this->clearStackToTableContext($clear);
3387  
3388              /* Insert an HTML element for the token, then switch the insertion
3389              mode to "in table body". */
3390              $this->insertElement($token);
3391              $this->mode = self::IN_TBODY;
3392  
3393              /* A start tag whose tag name is one of: "td", "th", "tr" */
3394          } elseif ($token['type'] === HTML5::STARTTAG &&
3395              in_array($token['name'], array('td', 'th', 'tr'))
3396          ) {
3397              /* Act as if a start tag token with the tag name "tbody" had been
3398              seen, then reprocess the current token. */
3399              $this->inTable(
3400                  array(
3401                      'name' => 'tbody',
3402                      'type' => HTML5::STARTTAG,
3403                      'attr' => array()
3404                  )
3405              );
3406  
3407              return $this->inTableBody($token);
3408  
3409              /* A start tag whose tag name is "table" */
3410          } elseif ($token['type'] === HTML5::STARTTAG &&
3411              $token['name'] === 'table'
3412          ) {
3413              /* Parse error. Act as if an end tag token with the tag name "table"
3414              had been seen, then, if that token wasn't ignored, reprocess the
3415              current token. */
3416              $this->inTable(
3417                  array(
3418                      'name' => 'table',
3419                      'type' => HTML5::ENDTAG
3420                  )
3421              );
3422  
3423              return $this->mainPhase($token);
3424  
3425              /* An end tag whose tag name is "table" */
3426          } elseif ($token['type'] === HTML5::ENDTAG &&
3427              $token['name'] === 'table'
3428          ) {
3429              /* If the stack of open elements does not have an element in table
3430              scope with the same tag name as the token, this is a parse error.
3431              Ignore the token. (innerHTML case) */
3432              if (!$this->elementInScope($token['name'], true)) {
3433                  return false;
3434  
3435                  /* Otherwise: */
3436              } else {
3437                  /* Generate implied end tags. */
3438                  $this->generateImpliedEndTags();
3439  
3440                  /* Now, if the current node is not a table element, then this
3441                  is a parse error. */
3442                  // w/e
3443  
3444                  /* Pop elements from this stack until a table element has been
3445                  popped from the stack. */
3446                  while (true) {
3447                      $current = end($this->stack)->nodeName;
3448                      array_pop($this->stack);
3449  
3450                      if ($current === 'table') {
3451                          break;
3452                      }
3453                  }
3454  
3455                  /* Reset the insertion mode appropriately. */
3456                  $this->resetInsertionMode();
3457              }
3458  
3459              /* An end tag whose tag name is one of: "body", "caption", "col",
3460              "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
3461          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3462                  $token['name'],
3463                  array(
3464                      'body',
3465                      'caption',
3466                      'col',
3467                      'colgroup',
3468                      'html',
3469                      'tbody',
3470                      'td',
3471                      'tfoot',
3472                      'th',
3473                      'thead',
3474                      'tr'
3475                  )
3476              )
3477          ) {
3478              // Parse error. Ignore the token.
3479  
3480              /* Anything else */
3481          } else {
3482              /* Parse error. Process the token as if the insertion mode was "in
3483              body", with the following exception: */
3484  
3485              /* If the current node is a table, tbody, tfoot, thead, or tr
3486              element, then, whenever a node would be inserted into the current
3487              node, it must instead be inserted into the foster parent element. */
3488              if (in_array(
3489                  end($this->stack)->nodeName,
3490                  array('table', 'tbody', 'tfoot', 'thead', 'tr')
3491              )
3492              ) {
3493                  /* The foster parent element is the parent element of the last
3494                  table element in the stack of open elements, if there is a
3495                  table element and it has such a parent element. If there is no
3496                  table element in the stack of open elements (innerHTML case),
3497                  then the foster parent element is the first element in the
3498                  stack of open elements (the html  element). Otherwise, if there
3499                  is a table element in the stack of open elements, but the last
3500                  table element in the stack of open elements has no parent, or
3501                  its parent node is not an element, then the foster parent
3502                  element is the element before the last table element in the
3503                  stack of open elements. */
3504                  for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3505                      if ($this->stack[$n]->nodeName === 'table') {
3506                          $table = $this->stack[$n];
3507                          break;
3508                      }
3509                  }
3510  
3511                  if (isset($table) && $table->parentNode !== null) {
3512                      $this->foster_parent = $table->parentNode;
3513  
3514                  } elseif (!isset($table)) {
3515                      $this->foster_parent = $this->stack[0];
3516  
3517                  } elseif (isset($table) && ($table->parentNode === null ||
3518                          $table->parentNode->nodeType !== XML_ELEMENT_NODE)
3519                  ) {
3520                      $this->foster_parent = $this->stack[$n - 1];
3521                  }
3522              }
3523  
3524              $this->inBody($token);
3525          }
3526      }
3527  
3528      private function inCaption($token)
3529      {
3530          /* An end tag whose tag name is "caption" */
3531          if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
3532              /* If the stack of open elements does not have an element in table
3533              scope with the same tag name as the token, this is a parse error.
3534              Ignore the token. (innerHTML case) */
3535              if (!$this->elementInScope($token['name'], true)) {
3536                  // Ignore
3537  
3538                  /* Otherwise: */
3539              } else {
3540                  /* Generate implied end tags. */
3541                  $this->generateImpliedEndTags();
3542  
3543                  /* Now, if the current node is not a caption element, then this
3544                  is a parse error. */
3545                  // w/e
3546  
3547                  /* Pop elements from this stack until a caption element has
3548                  been popped from the stack. */
3549                  while (true) {
3550                      $node = end($this->stack)->nodeName;
3551                      array_pop($this->stack);
3552  
3553                      if ($node === 'caption') {
3554                          break;
3555                      }
3556                  }
3557  
3558                  /* Clear the list of active formatting elements up to the last
3559                  marker. */
3560                  $this->clearTheActiveFormattingElementsUpToTheLastMarker();
3561  
3562                  /* Switch the insertion mode to "in table". */
3563                  $this->mode = self::IN_TABLE;
3564              }
3565  
3566              /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3567              "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
3568              name is "table" */
3569          } elseif (($token['type'] === HTML5::STARTTAG && in_array(
3570                      $token['name'],
3571                      array(
3572                          'caption',
3573                          'col',
3574                          'colgroup',
3575                          'tbody',
3576                          'td',
3577                          'tfoot',
3578                          'th',
3579                          'thead',
3580                          'tr'
3581                      )
3582                  )) || ($token['type'] === HTML5::ENDTAG &&
3583                  $token['name'] === 'table')
3584          ) {
3585              /* Parse error. Act as if an end tag with the tag name "caption"
3586              had been seen, then, if that token wasn't ignored, reprocess the
3587              current token. */
3588              $this->inCaption(
3589                  array(
3590                      'name' => 'caption',
3591                      'type' => HTML5::ENDTAG
3592                  )
3593              );
3594  
3595              return $this->inTable($token);
3596  
3597              /* An end tag whose tag name is one of: "body", "col", "colgroup",
3598              "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
3599          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3600                  $token['name'],
3601                  array(
3602                      'body',
3603                      'col',
3604                      'colgroup',
3605                      'html',
3606                      'tbody',
3607                      'tfoot',
3608                      'th',
3609                      'thead',
3610                      'tr'
3611                  )
3612              )
3613          ) {
3614              // Parse error. Ignore the token.
3615  
3616              /* Anything else */
3617          } else {
3618              /* Process the token as if the insertion mode was "in body". */
3619              $this->inBody($token);
3620          }
3621      }
3622  
3623      private function inColumnGroup($token)
3624      {
3625          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3626          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3627          or U+0020 SPACE */
3628          if ($token['type'] === HTML5::CHARACTR &&
3629              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
3630          ) {
3631              /* Append the character to the current node. */
3632              $text = $this->dom->createTextNode($token['data']);
3633              end($this->stack)->appendChild($text);
3634  
3635              /* A comment token */
3636          } elseif ($token['type'] === HTML5::COMMENT) {
3637              /* Append a Comment node to the current node with the data
3638              attribute set to the data given in the comment token. */
3639              $comment = $this->dom->createComment($token['data']);
3640              end($this->stack)->appendChild($comment);
3641  
3642              /* A start tag whose tag name is "col" */
3643          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
3644              /* Insert a col element for the token. Immediately pop the current
3645              node off the stack of open elements. */
3646              $this->insertElement($token);
3647              array_pop($this->stack);
3648  
3649              /* An end tag whose tag name is "colgroup" */
3650          } elseif ($token['type'] === HTML5::ENDTAG &&
3651              $token['name'] === 'colgroup'
3652          ) {
3653              /* If the current node is the root html element, then this is a
3654              parse error, ignore the token. (innerHTML case) */
3655              if (end($this->stack)->nodeName === 'html') {
3656                  // Ignore
3657  
3658                  /* Otherwise, pop the current node (which will be a colgroup
3659                  element) from the stack of open elements. Switch the insertion
3660                  mode to "in table". */
3661              } else {
3662                  array_pop($this->stack);
3663                  $this->mode = self::IN_TABLE;
3664              }
3665  
3666              /* An end tag whose tag name is "col" */
3667          } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
3668              /* Parse error. Ignore the token. */
3669  
3670              /* Anything else */
3671          } else {
3672              /* Act as if an end tag with the tag name "colgroup" had been seen,
3673              and then, if that token wasn't ignored, reprocess the current token. */
3674              $this->inColumnGroup(
3675                  array(
3676                      'name' => 'colgroup',
3677                      'type' => HTML5::ENDTAG
3678                  )
3679              );
3680  
3681              return $this->inTable($token);
3682          }
3683      }
3684  
3685      private function inTableBody($token)
3686      {
3687          $clear = array('tbody', 'tfoot', 'thead', 'html');
3688  
3689          /* A start tag whose tag name is "tr" */
3690          if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
3691              /* Clear the stack back to a table body context. */
3692              $this->clearStackToTableContext($clear);
3693  
3694              /* Insert a tr element for the token, then switch the insertion
3695              mode to "in row". */
3696              $this->insertElement($token);
3697              $this->mode = self::IN_ROW;
3698  
3699              /* A start tag whose tag name is one of: "th", "td" */
3700          } elseif ($token['type'] === HTML5::STARTTAG &&
3701              ($token['name'] === 'th' || $token['name'] === 'td')
3702          ) {
3703              /* Parse error. Act as if a start tag with the tag name "tr" had
3704              been seen, then reprocess the current token. */
3705              $this->inTableBody(
3706                  array(
3707                      'name' => 'tr',
3708                      'type' => HTML5::STARTTAG,
3709                      'attr' => array()
3710                  )
3711              );
3712  
3713              return $this->inRow($token);
3714  
3715              /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3716          } elseif ($token['type'] === HTML5::ENDTAG &&
3717              in_array($token['name'], array('tbody', 'tfoot', 'thead'))
3718          ) {
3719              /* If the stack of open elements does not have an element in table
3720              scope with the same tag name as the token, this is a parse error.
3721              Ignore the token. */
3722              if (!$this->elementInScope($token['name'], true)) {
3723                  // Ignore
3724  
3725                  /* Otherwise: */
3726              } else {
3727                  /* Clear the stack back to a table body context. */
3728                  $this->clearStackToTableContext($clear);
3729  
3730                  /* Pop the current node from the stack of open elements. Switch
3731                  the insertion mode to "in table". */
3732                  array_pop($this->stack);
3733                  $this->mode = self::IN_TABLE;
3734              }
3735  
3736              /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3737              "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
3738          } elseif (($token['type'] === HTML5::STARTTAG && in_array(
3739                      $token['name'],
3740                      array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead')
3741                  )) ||
3742              ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table')
3743          ) {
3744              /* If the stack of open elements does not have a tbody, thead, or
3745              tfoot element in table scope, this is a parse error. Ignore the
3746              token. (innerHTML case) */
3747              if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
3748                  // Ignore.
3749  
3750                  /* Otherwise: */
3751              } else {
3752                  /* Clear the stack back to a table body context. */
3753                  $this->clearStackToTableContext($clear);
3754  
3755                  /* Act as if an end tag with the same tag name as the current
3756                  node ("tbody", "tfoot", or "thead") had been seen, then
3757                  reprocess the current token. */
3758                  $this->inTableBody(
3759                      array(
3760                          'name' => end($this->stack)->nodeName,
3761                          'type' => HTML5::ENDTAG
3762                      )
3763                  );
3764  
3765                  return $this->mainPhase($token);
3766              }
3767  
3768              /* An end tag whose tag name is one of: "body", "caption", "col",
3769              "colgroup", "html", "td", "th", "tr" */
3770          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3771                  $token['name'],
3772                  array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
3773              )
3774          ) {
3775              /* Parse error. Ignore the token. */
3776  
3777              /* Anything else */
3778          } else {
3779              /* Process the token as if the insertion mode was "in table". */
3780              $this->inTable($token);
3781          }
3782      }
3783  
3784      private function inRow($token)
3785      {
3786          $clear = array('tr', 'html');
3787  
3788          /* A start tag whose tag name is one of: "th", "td" */
3789          if ($token['type'] === HTML5::STARTTAG &&
3790              ($token['name'] === 'th' || $token['name'] === 'td')
3791          ) {
3792              /* Clear the stack back to a table row context. */
3793              $this->clearStackToTableContext($clear);
3794  
3795              /* Insert an HTML element for the token, then switch the insertion
3796              mode to "in cell". */
3797              $this->insertElement($token);
3798              $this->mode = self::IN_CELL;
3799  
3800              /* Insert a marker at the end of the list of active formatting
3801              elements. */
3802              $this->a_formatting[] = self::MARKER;
3803  
3804              /* An end tag whose tag name is "tr" */
3805          } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
3806              /* If the stack of open elements does not have an element in table
3807              scope with the same tag name as the token, this is a parse error.
3808              Ignore the token. (innerHTML case) */
3809              if (!$this->elementInScope($token['name'], true)) {
3810                  // Ignore.
3811  
3812                  /* Otherwise: */
3813              } else {
3814                  /* Clear the stack back to a table row context. */
3815                  $this->clearStackToTableContext($clear);
3816  
3817                  /* Pop the current node (which will be a tr element) from the
3818                  stack of open elements. Switch the insertion mode to "in table
3819                  body". */
3820                  array_pop($this->stack);
3821                  $this->mode = self::IN_TBODY;
3822              }
3823  
3824              /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3825              "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
3826          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3827                  $token['name'],
3828                  array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr')
3829              )
3830          ) {
3831              /* Act as if an end tag with the tag name "tr" had been seen, then,
3832              if that token wasn't ignored, reprocess the current token. */
3833              $this->inRow(
3834                  array(
3835                      'name' => 'tr',
3836                      'type' => HTML5::ENDTAG
3837                  )
3838              );
3839  
3840              return $this->inCell($token);
3841  
3842              /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3843          } elseif ($token['type'] === HTML5::ENDTAG &&
3844              in_array($token['name'], array('tbody', 'tfoot', 'thead'))
3845          ) {
3846              /* If the stack of open elements does not have an element in table
3847              scope with the same tag name as the token, this is a parse error.
3848              Ignore the token. */
3849              if (!$this->elementInScope($token['name'], true)) {
3850                  // Ignore.
3851  
3852                  /* Otherwise: */
3853              } else {
3854                  /* Otherwise, act as if an end tag with the tag name "tr" had
3855                  been seen, then reprocess the current token. */
3856                  $this->inRow(
3857                      array(
3858                          'name' => 'tr',
3859                          'type' => HTML5::ENDTAG
3860                      )
3861                  );
3862  
3863                  return $this->inCell($token);
3864              }
3865  
3866              /* An end tag whose tag name is one of: "body", "caption", "col",
3867              "colgroup", "html", "td", "th" */
3868          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3869                  $token['name'],
3870                  array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
3871              )
3872          ) {
3873              /* Parse error. Ignore the token. */
3874  
3875              /* Anything else */
3876          } else {
3877              /* Process the token as if the insertion mode was "in table". */
3878              $this->inTable($token);
3879          }
3880      }
3881  
3882      private function inCell($token)
3883      {
3884          /* An end tag whose tag name is one of: "td", "th" */
3885          if ($token['type'] === HTML5::ENDTAG &&
3886              ($token['name'] === 'td' || $token['name'] === 'th')
3887          ) {
3888              /* If the stack of open elements does not have an element in table
3889              scope with the same tag name as that of the token, then this is a
3890              parse error and the token must be ignored. */
3891              if (!$this->elementInScope($token['name'], true)) {
3892                  // Ignore.
3893  
3894                  /* Otherwise: */
3895              } else {
3896                  /* Generate implied end tags, except for elements with the same
3897                  tag name as the token. */
3898                  $this->generateImpliedEndTags(array($token['name']));
3899  
3900                  /* Now, if the current node is not an element with the same tag
3901                  name as the token, then this is a parse error. */
3902                  // k
3903  
3904                  /* Pop elements from this stack until an element with the same
3905                  tag name as the token has been popped from the stack. */
3906                  while (true) {
3907                      $node = end($this->stack)->nodeName;
3908                      array_pop($this->stack);
3909  
3910                      if ($node === $token['name']) {
3911                          break;
3912                      }
3913                  }
3914  
3915                  /* Clear the list of active formatting elements up to the last
3916                  marker. */
3917                  $this->clearTheActiveFormattingElementsUpToTheLastMarker();
3918  
3919                  /* Switch the insertion mode to "in row". (The current node
3920                  will be a tr element at this point.) */
3921                  $this->mode = self::IN_ROW;
3922              }
3923  
3924              /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3925              "tbody", "td", "tfoot", "th", "thead", "tr" */
3926          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3927                  $token['name'],
3928                  array(
3929                      'caption',
3930                      'col',
3931                      'colgroup',
3932                      'tbody',
3933                      'td',
3934                      'tfoot',
3935                      'th',
3936                      'thead',
3937                      'tr'
3938                  )
3939              )
3940          ) {
3941              /* If the stack of open elements does not have a td or th element
3942              in table scope, then this is a parse error; ignore the token.
3943              (innerHTML case) */
3944              if (!$this->elementInScope(array('td', 'th'), true)) {
3945                  // Ignore.
3946  
3947                  /* Otherwise, close the cell (see below) and reprocess the current
3948                  token. */
3949              } else {
3950                  $this->closeCell();
3951                  return $this->inRow($token);
3952              }
3953  
3954              /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3955              "tbody", "td", "tfoot", "th", "thead", "tr" */
3956          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3957                  $token['name'],
3958                  array(
3959                      'caption',
3960                      'col',
3961                      'colgroup',
3962                      'tbody',
3963                      'td',
3964                      'tfoot',
3965                      'th',
3966                      'thead',
3967                      'tr'
3968                  )
3969              )
3970          ) {
3971              /* If the stack of open elements does not have a td or th element
3972              in table scope, then this is a parse error; ignore the token.
3973              (innerHTML case) */
3974              if (!$this->elementInScope(array('td', 'th'), true)) {
3975                  // Ignore.
3976  
3977                  /* Otherwise, close the cell (see below) and reprocess the current
3978                  token. */
3979              } else {
3980                  $this->closeCell();
3981                  return $this->inRow($token);
3982              }
3983  
3984              /* An end tag whose tag name is one of: "body", "caption", "col",
3985              "colgroup", "html" */
3986          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3987                  $token['name'],
3988                  array('body', 'caption', 'col', 'colgroup', 'html')
3989              )
3990          ) {
3991              /* Parse error. Ignore the token. */
3992  
3993              /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
3994              "thead", "tr" */
3995          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3996                  $token['name'],
3997                  array('table', 'tbody', 'tfoot', 'thead', 'tr')
3998              )
3999          ) {
4000              /* If the stack of open elements does not have an element in table
4001              scope with the same tag name as that of the token (which can only
4002              happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
4003              then this is a parse error and the token must be ignored. */
4004              if (!$this->elementInScope($token['name'], true)) {
4005                  // Ignore.
4006  
4007                  /* Otherwise, close the cell (see below) and reprocess the current
4008                  token. */
4009              } else {
4010                  $this->closeCell();
4011                  return $this->inRow($token);
4012              }
4013  
4014              /* Anything else */
4015          } else {
4016              /* Process the token as if the insertion mode was "in body". */
4017              $this->inBody($token);
4018          }
4019      }
4020  
4021      private function inSelect($token)
4022      {
4023          /* Handle the token as follows: */
4024  
4025          /* A character token */
4026          if ($token['type'] === HTML5::CHARACTR) {
4027              /* Append the token's character to the current node. */
4028              $this->insertText($token['data']);
4029  
4030              /* A comment token */
4031          } elseif ($token['type'] === HTML5::COMMENT) {
4032              /* Append a Comment node to the current node with the data
4033              attribute set to the data given in the comment token. */
4034              $this->insertComment($token['data']);
4035  
4036              /* A start tag token whose tag name is "option" */
4037          } elseif ($token['type'] === HTML5::STARTTAG &&
4038              $token['name'] === 'option'
4039          ) {
4040              /* If the current node is an option element, act as if an end tag
4041              with the tag name "option" had been seen. */
4042              if (end($this->stack)->nodeName === 'option') {
4043                  $this->inSelect(
4044                      array(
4045                          'name' => 'option',
4046                          'type' => HTML5::ENDTAG
4047                      )
4048                  );
4049              }
4050  
4051              /* Insert an HTML element for the token. */
4052              $this->insertElement($token);
4053  
4054              /* A start tag token whose tag name is "optgroup" */
4055          } elseif ($token['type'] === HTML5::STARTTAG &&
4056              $token['name'] === 'optgroup'
4057          ) {
4058              /* If the current node is an option element, act as if an end tag
4059              with the tag name "option" had been seen. */
4060              if (end($this->stack)->nodeName === 'option') {
4061                  $this->inSelect(
4062                      array(
4063                          'name' => 'option',
4064                          'type' => HTML5::ENDTAG
4065                      )
4066                  );
4067              }
4068  
4069              /* If the current node is an optgroup element, act as if an end tag
4070              with the tag name "optgroup" had been seen. */
4071              if (end($this->stack)->nodeName === 'optgroup') {
4072                  $this->inSelect(
4073                      array(
4074                          'name' => 'optgroup',
4075                          'type' => HTML5::ENDTAG
4076                      )
4077                  );
4078              }
4079  
4080              /* Insert an HTML element for the token. */
4081              $this->insertElement($token);
4082  
4083              /* An end tag token whose tag name is "optgroup" */
4084          } elseif ($token['type'] === HTML5::ENDTAG &&
4085              $token['name'] === 'optgroup'
4086          ) {
4087              /* First, if the current node is an option element, and the node
4088              immediately before it in the stack of open elements is an optgroup
4089              element, then act as if an end tag with the tag name "option" had
4090              been seen. */
4091              $elements_in_stack = count($this->stack);
4092  
4093              if ($this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
4094                  $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup'
4095              ) {
4096                  $this->inSelect(
4097                      array(
4098                          'name' => 'option',
4099                          'type' => HTML5::ENDTAG
4100                      )
4101                  );
4102              }
4103  
4104              /* If the current node is an optgroup element, then pop that node
4105              from the stack of open elements. Otherwise, this is a parse error,
4106              ignore the token. */
4107              if ($this->stack[$elements_in_stack - 1] === 'optgroup') {
4108                  array_pop($this->stack);
4109              }
4110  
4111              /* An end tag token whose tag name is "option" */
4112          } elseif ($token['type'] === HTML5::ENDTAG &&
4113              $token['name'] === 'option'
4114          ) {
4115              /* If the current node is an option element, then pop that node
4116              from the stack of open elements. Otherwise, this is a parse error,
4117              ignore the token. */
4118              if (end($this->stack)->nodeName === 'option') {
4119                  array_pop($this->stack);
4120              }
4121  
4122              /* An end tag whose tag name is "select" */
4123          } elseif ($token['type'] === HTML5::ENDTAG &&
4124              $token['name'] === 'select'
4125          ) {
4126              /* If the stack of open elements does not have an element in table
4127              scope with the same tag name as the token, this is a parse error.
4128              Ignore the token. (innerHTML case) */
4129              if (!$this->elementInScope($token['name'], true)) {
4130                  // w/e
4131  
4132                  /* Otherwise: */
4133              } else {
4134                  /* Pop elements from the stack of open elements until a select
4135                  element has been popped from the stack. */
4136                  while (true) {
4137                      $current = end($this->stack)->nodeName;
4138                      array_pop($this->stack);
4139  
4140                      if ($current === 'select') {
4141                          break;
4142                      }
4143                  }
4144  
4145                  /* Reset the insertion mode appropriately. */
4146                  $this->resetInsertionMode();
4147              }
4148  
4149              /* A start tag whose tag name is "select" */
4150          } elseif ($token['name'] === 'select' &&
4151              $token['type'] === HTML5::STARTTAG
4152          ) {
4153              /* Parse error. Act as if the token had been an end tag with the
4154              tag name "select" instead. */
4155              $this->inSelect(
4156                  array(
4157                      'name' => 'select',
4158                      'type' => HTML5::ENDTAG
4159                  )
4160              );
4161  
4162              /* An end tag whose tag name is one of: "caption", "table", "tbody",
4163              "tfoot", "thead", "tr", "td", "th" */
4164          } elseif (in_array(
4165                  $token['name'],
4166                  array(
4167                      'caption',
4168                      'table',
4169                      'tbody',
4170                      'tfoot',
4171                      'thead',
4172                      'tr',
4173                      'td',
4174                      'th'
4175                  )
4176              ) && $token['type'] === HTML5::ENDTAG
4177          ) {
4178              /* Parse error. */
4179              // w/e
4180  
4181              /* If the stack of open elements has an element in table scope with
4182              the same tag name as that of the token, then act as if an end tag
4183              with the tag name "select" had been seen, and reprocess the token.
4184              Otherwise, ignore the token. */
4185              if ($this->elementInScope($token['name'], true)) {
4186                  $this->inSelect(
4187                      array(
4188                          'name' => 'select',
4189                          'type' => HTML5::ENDTAG
4190                      )
4191                  );
4192  
4193                  $this->mainPhase($token);
4194              }
4195  
4196              /* Anything else */
4197          } else {
4198              /* Parse error. Ignore the token. */
4199          }
4200      }
4201  
4202      private function afterBody($token)
4203      {
4204          /* Handle the token as follows: */
4205  
4206          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4207          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4208          or U+0020 SPACE */
4209          if ($token['type'] === HTML5::CHARACTR &&
4210              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4211          ) {
4212              /* Process the token as it would be processed if the insertion mode
4213              was "in body". */
4214              $this->inBody($token);
4215  
4216              /* A comment token */
4217          } elseif ($token['type'] === HTML5::COMMENT) {
4218              /* Append a Comment node to the first element in the stack of open
4219              elements (the html element), with the data attribute set to the
4220              data given in the comment token. */
4221              $comment = $this->dom->createComment($token['data']);
4222              $this->stack[0]->appendChild($comment);
4223  
4224              /* An end tag with the tag name "html" */
4225          } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') {
4226              /* If the parser was originally created in order to handle the
4227              setting of an element's innerHTML attribute, this is a parse error;
4228              ignore the token. (The element will be an html element in this
4229              case.) (innerHTML case) */
4230  
4231              /* Otherwise, switch to the trailing end phase. */
4232              $this->phase = self::END_PHASE;
4233  
4234              /* Anything else */
4235          } else {
4236              /* Parse error. Set the insertion mode to "in body" and reprocess
4237              the token. */
4238              $this->mode = self::IN_BODY;
4239              return $this->inBody($token);
4240          }
4241      }
4242  
4243      private function inFrameset($token)
4244      {
4245          /* Handle the token as follows: */
4246  
4247          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4248          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4249          U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
4250          if ($token['type'] === HTML5::CHARACTR &&
4251              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4252          ) {
4253              /* Append the character to the current node. */
4254              $this->insertText($token['data']);
4255  
4256              /* A comment token */
4257          } elseif ($token['type'] === HTML5::COMMENT) {
4258              /* Append a Comment node to the current node with the data
4259              attribute set to the data given in the comment token. */
4260              $this->insertComment($token['data']);
4261  
4262              /* A start tag with the tag name "frameset" */
4263          } elseif ($token['name'] === 'frameset' &&
4264              $token['type'] === HTML5::STARTTAG
4265          ) {
4266              $this->insertElement($token);
4267  
4268              /* An end tag with the tag name "frameset" */
4269          } elseif ($token['name'] === 'frameset' &&
4270              $token['type'] === HTML5::ENDTAG
4271          ) {
4272              /* If the current node is the root html element, then this is a
4273              parse error; ignore the token. (innerHTML case) */
4274              if (end($this->stack)->nodeName === 'html') {
4275                  // Ignore
4276  
4277              } else {
4278                  /* Otherwise, pop the current node from the stack of open
4279                  elements. */
4280                  array_pop($this->stack);
4281  
4282                  /* If the parser was not originally created in order to handle
4283                  the setting of an element's innerHTML attribute (innerHTML case),
4284                  and the current node is no longer a frameset element, then change
4285                  the insertion mode to "after frameset". */
4286                  $this->mode = self::AFTR_FRAME;
4287              }
4288  
4289              /* A start tag with the tag name "frame" */
4290          } elseif ($token['name'] === 'frame' &&
4291              $token['type'] === HTML5::STARTTAG
4292          ) {
4293              /* Insert an HTML element for the token. */
4294              $this->insertElement($token);
4295  
4296              /* Immediately pop the current node off the stack of open elements. */
4297              array_pop($this->stack);
4298  
4299              /* A start tag with the tag name "noframes" */
4300          } elseif ($token['name'] === 'noframes' &&
4301              $token['type'] === HTML5::STARTTAG
4302          ) {
4303              /* Process the token as if the insertion mode had been "in body". */
4304              $this->inBody($token);
4305  
4306              /* Anything else */
4307          } else {
4308              /* Parse error. Ignore the token. */
4309          }
4310      }
4311  
4312      private function afterFrameset($token)
4313      {
4314          /* Handle the token as follows: */
4315  
4316          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4317          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4318          U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
4319          if ($token['type'] === HTML5::CHARACTR &&
4320              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4321          ) {
4322              /* Append the character to the current node. */
4323              $this->insertText($token['data']);
4324  
4325              /* A comment token */
4326          } elseif ($token['type'] === HTML5::COMMENT) {
4327              /* Append a Comment node to the current node with the data
4328              attribute set to the data given in the comment token. */
4329              $this->insertComment($token['data']);
4330  
4331              /* An end tag with the tag name "html" */
4332          } elseif ($token['name'] === 'html' &&
4333              $token['type'] === HTML5::ENDTAG
4334          ) {
4335              /* Switch to the trailing end phase. */
4336              $this->phase = self::END_PHASE;
4337  
4338              /* A start tag with the tag name "noframes" */
4339          } elseif ($token['name'] === 'noframes' &&
4340              $token['type'] === HTML5::STARTTAG
4341          ) {
4342              /* Process the token as if the insertion mode had been "in body". */
4343              $this->inBody($token);
4344  
4345              /* Anything else */
4346          } else {
4347              /* Parse error. Ignore the token. */
4348          }
4349      }
4350  
/**
 * Handles one token emitted after the main phase has ended.
 *
 * DOCTYPE tokens are ignored, comments are appended directly to the
 * document, whitespace-only character tokens are forwarded to mainPhase()
 * without leaving this phase, and any other character token or any start
 * or end tag switches the parser back to the main phase and is reprocessed
 * there. End-of-file tokens need no action.
 *
 * @param array $token Token to process. 'type' is always read; 'data' is
 *                     read for comment and character tokens.
 * @return mixed The result of mainPhase() when the token forces a return
 *               to the main phase; nothing otherwise.
 */
private function trailingEndPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token */
    if ($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

        /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $this->dom->appendChild($this->dom->createComment($token['data']));

        /* A character token consisting only of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF)
        or U+0020 SPACE */
    } elseif ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

        /* A character token that is not whitespace-only, or a start tag
        token, or an end tag token. Whitespace-only character tokens were
        consumed by the branch above, so every character token that reaches
        this point contains at least one non-whitespace character. */
    } elseif ($type === HTML5::CHARACTR ||
        $type === HTML5::STARTTAG ||
        $type === HTML5::ENDTAG
    ) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);
    }

    /* An end-of-file token (or anything else): nothing left to do. */
}
4393  
/**
 * Creates an element for a tag token, attaches it to the tree through
 * appendToRealParent() and pushes it onto the stack of open elements.
 *
 * @param array $token  Tag token; 'name' is used as the element name and
 *                      'attr' is iterated as a list of arrays holding
 *                      'name' and 'value'.
 * @param bool  $append Accepted for callers, but never read by this method.
 * @param bool  $check  When true, the tag name is reduced to ASCII letters,
 *                      digits and hyphens, leading hyphens and digits are
 *                      stripped, and an empty result is replaced by "span".
 * @return mixed The element returned by $this->dom->createElement()
 *               (presumably a DOMElement - confirm against the type of
 *               $this->dom).
 */
private function insertElement($token, $append = true, $check = false)
{
    $tagName = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names.
    if ($check) {
        // Keep only ASCII letters, digits and hyphens.
        $tagName = preg_replace('/[^a-z0-9-]/i', '', $tagName);
        // Drop leading hyphens and digits ("0..9" is ltrim()'s range syntax).
        $tagName = ltrim($tagName, '-0..9');
        // In theory this should never be needed; "span" is an arbitrary
        // generic fallback.
        if ($tagName === '') {
            $tagName = 'span';
        }
    }

    $element = $this->dom->createElement($tagName);

    // The first occurrence of an attribute wins; later duplicates are skipped.
    foreach ($token['attr'] as $attribute) {
        $attributeName = $attribute['name'];

        if ($element->hasAttribute($attributeName)) {
            continue;
        }

        $element->setAttribute($attributeName, (string)$attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
4422  
/**
 * Creates a text node from the given data and attaches it to the tree
 * through appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
4428  
/**
 * Creates a comment node from the given data and attaches it to the tree
 * through appendToRealParent().
 *
 * @param string $data Content of the new comment node.
 */
private function insertComment($data)
{
    $this->appendToRealParent($this->dom->createComment($data));
}
4434  
/**
 * Attaches a node to its effective parent.
 *
 * Without a foster parent the node is appended to the current node (the
 * last entry of the stack of open elements). With a foster parent set, the
 * node is inserted immediately before the last attached table element on
 * the stack when the foster parent is that table's parent, and appended to
 * the foster parent otherwise; the foster parent is then reset to null.
 *
 * @param DOMNode $node Node to attach.
 */
private function appendToRealParent($node)
{
    $fosterParent = $this->foster_parent;

    // Ordinary case: append to the current node.
    if ($fosterParent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    // Find the last table element on the stack that has a parent node.
    $lastTable = null;
    $index = count($this->stack);

    while ($index-- > 0) {
        $candidate = $this->stack[$index];

        if ($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $lastTable = $candidate;
            break;
        }
    }

    /* If the foster parent element is the parent element of the last table
    element in the stack of open elements, the new node goes immediately
    before that table; otherwise it is appended to the foster parent. */
    if ($lastTable !== null && $fosterParent->isSameNode($lastTable->parentNode)) {
        $fosterParent->insertBefore($node, $lastTable);
    } else {
        $fosterParent->appendChild($node);
    }

    // Foster parenting applies to a single insertion only.
    $this->foster_parent = null;
}
4465  
/**
 * Tells whether the stack of open elements has the given element in scope.
 *
 * The stack is walked from the current node (its last entry) towards the
 * root. The search succeeds on the first node whose tag name equals $el and
 * fails on the first table element, on the root html element, or - in the
 * plain "in scope" variant only - on a caption, td, th, button, marquee or
 * object element.
 *
 * @param string|string[] $el    Tag name to look for; for an array, true is
 *                               returned as soon as any of its names is in
 *                               scope.
 * @param bool            $table true selects the "has an element in table
 *                               scope" variant, anything else the "has an
 *                               element in scope" variant.
 * @return bool true when the element is in scope, false otherwise.
 */
private function elementInScope($el, $table = false)
{
    if (is_array($el)) {
        foreach ($el as $element) {
            if ($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    // Elements that end the search in the plain "in scope" variant.
    $barriers = array('caption', 'td', 'th', 'button', 'marquee', 'object');

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then move towards the top of the stack. */
    for ($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];
        $tagName = $node->tagName;

        /* 2. If node is the target node, terminate in a match state. */
        if ($tagName === $el) {
            return true;
        }

        /* 3. Otherwise, if node is a table element, terminate in a failure
        state. */
        if ($tagName === 'table') {
            return false;
        }

        /* 4. Otherwise, if the algorithm is the "has an element in scope"
        variant (rather than the "has an element in table scope" variant),
        and node is one of the barrier elements, terminate in a failure
        state. The table-scope variant must keep walking past these. */
        if ($table !== true && in_array($tagName, $barriers, true)) {
            return false;
        }

        /* 5. Otherwise, if node is an html element (root element), terminate
        in a failure state. */
        if ($node === $node->ownerDocument->documentElement) {
            return false;
        }

        /* Otherwise, continue with the previous entry in the stack of open
        elements. */
    }

    // The stack was exhausted without a match (e.g. it was empty).
    return false;
}
4525  
/**
 * Reconstructs the active formatting elements.
 *
 * Starting at the end of $this->a_formatting, the list is rewound to the
 * nearest marker or entry that is still on the stack of open elements.
 * Every entry after that point is then shallow-cloned; each clone is
 * appended to the current node, pushed onto the stack of open elements and
 * stored in the list in place of the entry it was cloned from.
 *
 * @return false|null false when there is nothing to reconstruct; nothing is
 *                    returned explicitly after a reconstruction.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $count = count($this->a_formatting);

    if ($count === 0) {
        return false;
    }

    /* 2./3. Let entry be the last (most recently added) entry in the list.
    If it is a marker, or an element that is in the stack of open elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $entry = end($this->a_formatting);

    if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* 4.-7. Rewind while the preceding entry is neither a marker nor an
    element on the stack of open elements. $first ends up on the entry one
    later than that boundary (or on the very first entry of the list), so
    the boundary entry itself - which may be a marker and therefore not a
    node at all - is never cloned. */
    $first = $count - 1;

    while ($first > 0) {
        $previous = $this->a_formatting[$first - 1];

        if ($previous === self::MARKER || in_array($previous, $this->stack, true)) {
            break;
        }

        $first--;
    }

    for ($position = $first; $position < $count; $position++) {
        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $this->a_formatting[$position]->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$position] = $clone;

        /* 11. Continue with the next entry until the last entry of the
        list has been replaced. */
    }
}
4597  
/**
 * Clears the list of active formatting elements up to the last marker.
 *
 * Entries are removed from the end of $this->a_formatting until a marker
 * has been removed. If the list contains no marker at all, it is simply
 * emptied; without that bound the loop would never terminate, because
 * popping an empty array never yields the marker value.
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    while (count($this->a_formatting) > 0) {
        /* 1./2. Let entry be the last (most recently added) entry in the
        list of active formatting elements and remove it from the list. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if ($entry === self::MARKER) {
            break;
        }
    }
}
4619  
/**
 * Generates implied end tags.
 *
 * While the current node (the last entry of the stack of open elements) is
 * a dd, dt, li, p, td, th or tr element that is not listed in $exclude, it
 * is popped from the stack.
 *
 * @param string[] $exclude Tag names for which no end tag may be implied.
 */
private function generateImpliedEndTags($exclude = array())
{
    $implied = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    // end() returns false once the stack is empty; stop there instead of
    // reading a property from a non-object.
    while (($current = end($this->stack)) !== false &&
        in_array($current->nodeName, $implied, true)
    ) {
        array_pop($this->stack);
    }
}
4634  
/**
 * Classifies an element by its tag name.
 *
 * The name is looked up, in this order, in $this->special, $this->scoping
 * and $this->formatting; the first list containing it decides the result.
 *
 * @param DOMElement $node Element whose tagName is classified.
 * @return mixed self::SPECIAL, self::SCOPING or self::FORMATTING for a
 *               match, self::PHRASING when no list contains the name.
 */
private function getElementCategory($node)
{
    $tagName = $node->tagName;

    if (in_array($tagName, $this->special)) {
        return self::SPECIAL;
    }

    if (in_array($tagName, $this->scoping)) {
        return self::SCOPING;
    }

    return in_array($tagName, $this->formatting)
        ? self::FORMATTING
        : self::PHRASING;
}
4648  
/**
 * Clears the stack of open elements back to a table context.
 *
 * Elements are popped from the stack until the node name of the current
 * node (the last stack entry) is one of the names in $elements.
 *
 * @param string[] $elements Node names at which popping stops.
 */
private function clearStackToTableContext($elements)
{
    /* While the current node is not one of the context elements, pop
    elements from the stack of open elements. If this causes any elements
    to be popped from the stack, then this is a parse error. */
    while (!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
4666  
/**
 * Resets the insertion mode from the stack of open elements.
 *
 * The stack is scanned from the current node towards the first entry. The
 * first node whose name has a dedicated insertion mode decides
 * $this->mode; an html element selects "before head" or "after head"
 * depending on whether the head pointer is null; and when the first stack
 * entry is reached without any match the mode becomes "in body".
 */
private function resetInsertionMode()
{
    // Node name => insertion mode (steps 4 to 13 of the algorithm).
    $modeByName = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        // "in body"! not "in head"!
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME,
    );

    $index = count($this->stack);

    while (--$index >= 0) {
        /* 2. Let node be the last node in the stack of open elements that
        has not been examined yet. */
        $node = $this->stack[$index];
        $name = $node->nodeName;

        /* 3. If node is the first node in the stack of open elements, then
        it is the last candidate. */
        $isFirstNode = $this->stack[0]->isSameNode($node);

        /* 4.-13. Switch to the mode registered for this node name and abort
        these steps. */
        if (array_key_exists($name, $modeByName)) {
            $this->mode = $modeByName[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element pointer
        is null, switch the insertion mode to "before head", otherwise,
        switch the insertion mode to "after head". In either case, abort
        these steps. */
        if ($name === 'html') {
            if ($this->head_pointer === null) {
                $this->mode = self::BEFOR_HEAD;
            } else {
                $this->mode = self::AFTER_HEAD;
            }
            return;
        }

        /* 15. If this was the first node of the stack, set the insertion
        mode to "in body" and abort these steps. */
        if ($isFirstNode) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
4765  
/**
 * Closes the currently open table cell, if there is one.
 *
 * "td" is tried before "th": for the first of the two that
 * elementInScope() reports in table scope, a synthetic end tag token with
 * that name is passed to inCell(), and nothing further is tried.
 */
private function closeCell()
{
    /* If the stack of open elements has a td or th element in table scope,
    then act as if an end tag token with that tag name had been seen. */
    foreach (array('td', 'th') as $tagName) {
        if (!$this->elementInScope($tagName, true)) {
            continue;
        }

        $endTag = array(
            'name' => $tagName,
            'type' => HTML5::ENDTAG
        );
        $this->inCell($endTag);

        return;
    }
}
4783  
/**
 * Returns the document held in $this->dom.
 *
 * @return mixed The value of $this->dom (presumably the DOMDocument built
 *               by the tree builder - confirm against its initialisation).
 */
public function save()
{
    $document = $this->dom;

    return $document;
}
4788  }