Moodle 4.2 XRef and Diffs

Search moodle.org's
Developer Documentation
Bug fixes for general core bugs in 4.2.x will end 22 April 2024 (12 months).
Bug fixes for security issues in 4.2.x will end 7 October 2024 (18 months).
PHP version: minimum PHP 8.0.0 Note: minimum PHP version has increased since Moodle 4.1. PHP 8.1.x is supported too.
Moodle 4.2 Database Schema (by Marcus Green)
/lib/htmlpurifier/HTMLPurifier/Lexer/ -> DOMLex.php (source)
   1  <?php
   2  
   3  /**
   4   * Parser that uses PHP 5's DOM extension (part of the core).
   5   *
   6   * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
   7   * It gives us a forgiving HTML parser, which we use to transform the HTML
   8   * into a DOM, and then into the tokens.  It is blazingly fast (for large
   9   * documents, it performs twenty times faster than
  10   * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
  11   *
  12   * @note Any empty elements will have empty tokens associated with them, even if
  13   * this is prohibited by the spec. This cannot be fixed until the spec
  14   * comes into play.
  15   *
  16   * @note PHP's DOM extension does not actually parse any entities, we use
  17   *       our own function to do that.
  18   *
  19   * @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
  20   *          If this is a huge problem, due to the fact that HTML is hand
  21   *          edited and you are unable to get a parser cache that caches the
  22   *          output of HTML Purifier while keeping the original HTML lying
  23   *          around, you may want to run Tidy on the resulting output or use
  24   *          HTMLPurifier_Lexer_DirectLex.
  25   */
  26  
  27  class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
  28  {
  29  
  30      /**
  31       * @type HTMLPurifier_TokenFactory
  32       */
  33      private $factory;
  34  
  35      public function __construct()
  36      {
  37          // setup the factory
  38          parent::__construct();
  39          $this->factory = new HTMLPurifier_TokenFactory();
  40      }
  41  
  42      /**
  43       * @param string $html
  44       * @param HTMLPurifier_Config $config
  45       * @param HTMLPurifier_Context $context
  46       * @return HTMLPurifier_Token[]
  47       */
  48      public function tokenizeHTML($html, $config, $context)
  49      {
  50          $html = $this->normalize($html, $config, $context);
  51  
  52          // attempt to armor stray angled brackets that cannot possibly
  53          // form tags and thus are probably being used as emoticons
  54          if ($config->get('Core.AggressivelyFixLt')) {
  55              $char = '[^a-z!\/]';
  56              $comment = "/<!--(.*?)(-->|\z)/is";
  57              $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
  58              do {
  59                  $old = $html;
  60                  $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
  61              } while ($html !== $old);
  62              $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
  63          }
  64  
  65          // preprocess html, essential for UTF-8
  66          $html = $this->wrapHTML($html, $config, $context);
  67  
  68          $doc = new DOMDocument();
  69          $doc->encoding = 'UTF-8'; // theoretically, the above has this covered
  70  
  71          $options = 0;
  72          if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {
  73              $options |= LIBXML_PARSEHUGE;
  74          }
  75  
  76          set_error_handler(array($this, 'muteErrorHandler'));
  77          // loadHTML() fails on PHP 5.3 when second parameter is given
  78          if ($options) {
  79              $doc->loadHTML($html, $options);
  80          } else {
  81              $doc->loadHTML($html);
  82          }
  83          restore_error_handler();
  84  
  85          $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
  86                        getElementsByTagName('body')->item(0);  // <body>
  87  
  88          $div = $body->getElementsByTagName('div')->item(0); // <div>
  89          $tokens = array();
  90          $this->tokenizeDOM($div, $tokens, $config);
  91          // If the div has a sibling, that means we tripped across
  92          // a premature </div> tag.  So remove the div we parsed,
  93          // and then tokenize the rest of body.  We can't tokenize
  94          // the sibling directly as we'll lose the tags in that case.
  95          if ($div->nextSibling) {
  96              $body->removeChild($div);
  97              $this->tokenizeDOM($body, $tokens, $config);
  98          }
  99          return $tokens;
 100      }
 101  
 102      /**
 103       * Iterative function that tokenizes a node, putting it into an accumulator.
 104       * To iterate is human, to recurse divine - L. Peter Deutsch
 105       * @param DOMNode $node DOMNode to be tokenized.
 106       * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
 107       * @return HTMLPurifier_Token of node appended to previously passed tokens.
 108       */
 109      protected function tokenizeDOM($node, &$tokens, $config)
 110      {
 111          $level = 0;
 112          $nodes = array($level => new HTMLPurifier_Queue(array($node)));
 113          $closingNodes = array();
 114          do {
 115              while (!$nodes[$level]->isEmpty()) {
 116                  $node = $nodes[$level]->shift(); // FIFO
 117                  $collect = $level > 0 ? true : false;
 118                  $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
 119                  if ($needEndingTag) {
 120                      $closingNodes[$level][] = $node;
 121                  }
 122                  if ($node->childNodes && $node->childNodes->length) {
 123                      $level++;
 124                      $nodes[$level] = new HTMLPurifier_Queue();
 125                      foreach ($node->childNodes as $childNode) {
 126                          $nodes[$level]->push($childNode);
 127                      }
 128                  }
 129              }
 130              $level--;
 131              if ($level && isset($closingNodes[$level])) {
 132                  while ($node = array_pop($closingNodes[$level])) {
 133                      $this->createEndNode($node, $tokens);
 134                  }
 135              }
 136          } while ($level > 0);
 137      }
 138  
 139      /**
 140       * Portably retrieve the tag name of a node; deals with older versions
 141       * of libxml like 2.7.6
 142       * @param DOMNode $node
 143       */
 144      protected function getTagName($node)
 145      {
 146          if (isset($node->tagName)) {
 147              return $node->tagName;
 148          } else if (isset($node->nodeName)) {
 149              return $node->nodeName;
 150          } else if (isset($node->localName)) {
 151              return $node->localName;
 152          }
 153          return null;
 154      }
 155  
 156      /**
 157       * Portably retrieve the data of a node; deals with older versions
 158       * of libxml like 2.7.6
 159       * @param DOMNode $node
 160       */
 161      protected function getData($node)
 162      {
 163          if (isset($node->data)) {
 164              return $node->data;
 165          } else if (isset($node->nodeValue)) {
 166              return $node->nodeValue;
 167          } else if (isset($node->textContent)) {
 168              return $node->textContent;
 169          }
 170          return null;
 171      }
 172  
 173  
 174      /**
 175       * @param DOMNode $node DOMNode to be tokenized.
 176       * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
 177       * @param bool $collect  Says whether or start and close are collected, set to
 178       *                    false at first recursion because it's the implicit DIV
 179       *                    tag you're dealing with.
 180       * @return bool if the token needs an endtoken
 181       * @todo data and tagName properties don't seem to exist in DOMNode?
 182       */
 183      protected function createStartNode($node, &$tokens, $collect, $config)
 184      {
 185          // intercept non element nodes. WE MUST catch all of them,
 186          // but we're not getting the character reference nodes because
 187          // those should have been preprocessed
 188          if ($node->nodeType === XML_TEXT_NODE) {
 189              $data = $this->getData($node); // Handle variable data property
 190              if ($data !== null) {
 191                $tokens[] = $this->factory->createText($data);
 192              }
 193              return false;
 194          } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
 195              // undo libxml's special treatment of <script> and <style> tags
 196              $last = end($tokens);
 197              $data = $node->data;
 198              // (note $node->tagname is already normalized)
 199              if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
 200                  $new_data = trim($data);
 201                  if (substr($new_data, 0, 4) === '<!--') {
 202                      $data = substr($new_data, 4);
 203                      if (substr($data, -3) === '-->') {
 204                          $data = substr($data, 0, -3);
 205                      } else {
 206                          // Highly suspicious! Not sure what to do...
 207                      }
 208                  }
 209              }
 210              $tokens[] = $this->factory->createText($this->parseText($data, $config));
 211              return false;
 212          } elseif ($node->nodeType === XML_COMMENT_NODE) {
 213              // this is code is only invoked for comments in script/style in versions
 214              // of libxml pre-2.6.28 (regular comments, of course, are still
 215              // handled regularly)
 216              $tokens[] = $this->factory->createComment($node->data);
 217              return false;
 218          } elseif ($node->nodeType !== XML_ELEMENT_NODE) {
 219              // not-well tested: there may be other nodes we have to grab
 220              return false;
 221          }
 222          $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
 223          $tag_name = $this->getTagName($node); // Handle variable tagName property
 224          if (empty($tag_name)) {
 225              return (bool) $node->childNodes->length;
 226          }
 227          // We still have to make sure that the element actually IS empty
 228          if (!$node->childNodes->length) {
 229              if ($collect) {
 230                  $tokens[] = $this->factory->createEmpty($tag_name, $attr);
 231              }
 232              return false;
 233          } else {
 234              if ($collect) {
 235                  $tokens[] = $this->factory->createStart($tag_name, $attr);
 236              }
 237              return true;
 238          }
 239      }
 240  
 241      /**
 242       * @param DOMNode $node
 243       * @param HTMLPurifier_Token[] $tokens
 244       */
 245      protected function createEndNode($node, &$tokens)
 246      {
 247          $tag_name = $this->getTagName($node); // Handle variable tagName property
 248          $tokens[] = $this->factory->createEnd($tag_name);
 249      }
 250  
 251      /**
 252       * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
 253       *
 254       * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
 255       * @return array Associative array of attributes.
 256       */
 257      protected function transformAttrToAssoc($node_map)
 258      {
 259          // NamedNodeMap is documented very well, so we're using undocumented
 260          // features, namely, the fact that it implements Iterator and
 261          // has a ->length attribute
 262          if ($node_map->length === 0) {
 263              return array();
 264          }
 265          $array = array();
 266          foreach ($node_map as $attr) {
 267              $array[$attr->name] = $attr->value;
 268          }
 269          return $array;
 270      }
 271  
 272      /**
 273       * An error handler that mutes all errors
 274       * @param int $errno
 275       * @param string $errstr
 276       */
 277      public function muteErrorHandler($errno, $errstr)
 278      {
 279      }
 280  
 281      /**
 282       * Callback function for undoing escaping of stray angled brackets
 283       * in comments
 284       * @param array $matches
 285       * @return string
 286       */
 287      public function callbackUndoCommentSubst($matches)
 288      {
 289          return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
 290      }
 291  
 292      /**
 293       * Callback function that entity-izes ampersands in comments so that
 294       * callbackUndoCommentSubst doesn't clobber them
 295       * @param array $matches
 296       * @return string
 297       */
 298      public function callbackArmorCommentEntities($matches)
 299      {
 300          return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
 301      }
 302  
 303      /**
 304       * Wraps an HTML fragment in the necessary HTML
 305       * @param string $html
 306       * @param HTMLPurifier_Config $config
 307       * @param HTMLPurifier_Context $context
 308       * @return string
 309       */
 310      protected function wrapHTML($html, $config, $context, $use_div = true)
 311      {
 312          $def = $config->getDefinition('HTML');
 313          $ret = '';
 314  
 315          if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
 316              $ret .= '<!DOCTYPE html ';
 317              if (!empty($def->doctype->dtdPublic)) {
 318                  $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
 319              }
 320              if (!empty($def->doctype->dtdSystem)) {
 321                  $ret .= '"' . $def->doctype->dtdSystem . '" ';
 322              }
 323              $ret .= '>';
 324          }
 325  
 326          $ret .= '<html><head>';
 327          $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
 328          // No protection if $html contains a stray </div>!
 329          $ret .= '</head><body>';
 330          if ($use_div) $ret .= '<div>';
 331          $ret .= $html;
 332          if ($use_div) $ret .= '</div>';
 333          $ret .= '</body></html>';
 334          return $ret;
 335      }
 336  }
 337  
 338  // vim: et sw=4 sts=4