Search moodle.org's
Developer Documentation

See Release Notes
Long Term Support Release

  • Bug fixes for general core bugs in 4.1.x will end 13 November 2023 (12 months).
  • Bug fixes for security issues in 4.1.x will end 10 November 2025 (36 months).
  • PHP version: minimum PHP 7.4.0. Note: the minimum PHP version has increased since Moodle 4.0. PHP 8.0.x is also supported.

Differences Between: [Versions 310 and 401] [Versions 311 and 401] [Versions 39 and 401] [Versions 400 and 401]

   1  <?php
   2  
   3  /**
   4   * Forgivingly lexes HTML (SGML-style) markup into tokens.
   5   *
   6   * A lexer parses a string of SGML-style markup and converts it into
   7   * corresponding tokens.  It doesn't check for well-formedness, although its
   8   * internal mechanism may make this automatic (such as the case of
   9   * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
  10   * from.
  11   *
  12   * A lexer is HTML-oriented: it might work with XML, but it's not
  13   * recommended, as we adhere to a subset of the specification for optimization
  14   * reasons. This might change in the future. Also, most tokenizers are not
  15   * expected to handle DTDs or PIs.
  16   *
  17   * This class should not be directly instantiated, but you may use create() to
  18   * retrieve a default copy of the lexer.  Being a supertype, this class
  19   * does not actually define any implementation, but offers commonly used
  20   * convenience functions for subclasses.
  21   *
  22   * @note The unit tests will instantiate this class for testing purposes, as
  23   *       many of the utility functions require a class to be instantiated.
  24   *       This means that, even though this class is not runnable, it will
  25   *       not be declared abstract.
  26   *
  27   * @par
  28   *
  29   * @note
  30   * We use tokens rather than create a DOM representation because DOM would:
  31   *
  32   * @par
  33   *  -# Require more processing and memory to create,
  34   *  -# Not be streamable, and
  35   *  -# Include the entire document structure (html and body not needed).
  36   *
  37   * @par
  38   * However, DOM is helpful in that it makes it easy to move around nodes
  39   * without a lot of lookaheads to see when a tag is closed. This is a
  40   * limitation of the token system and some workarounds would be nice.
  41   */
  42  class HTMLPurifier_Lexer
  43  {
  44  
  45      /**
  46       * Whether or not this lexer implements line-number/column-number tracking.
  47       * If it does, set to true.
  48       */
  49      public $tracksLineNumbers = false;
  50  
  51      /**
  52       * @type HTMLPurifier_EntityParser
  53       */
  54      private $_entity_parser;
  55  
  56      // -- STATIC ----------------------------------------------------------
  57  
  58      /**
  59       * Retrieves or sets the default Lexer as a Prototype Factory.
  60       *
  61       * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
  62       * a few exceptions involving special features that only DirectLex
  63       * implements.
  64       *
  65       * @note The behavior of this class has changed, rather than accepting
  66       *       a prototype object, it now accepts a configuration object.
  67       *       To specify your own prototype, set %Core.LexerImpl to it.
  68       *       This change in behavior de-singletonizes the lexer object.
  69       *
  70       * @param HTMLPurifier_Config $config
  71       * @return HTMLPurifier_Lexer
  72       * @throws HTMLPurifier_Exception
  73       */
  74      public static function create($config)
  75      {
  76          if (!($config instanceof HTMLPurifier_Config)) {
  77              $lexer = $config;
  78              trigger_error(
  79                  "Passing a prototype to
  80                  HTMLPurifier_Lexer::create() is deprecated, please instead
  81                  use %Core.LexerImpl",
  82                  E_USER_WARNING
  83              );
  84          } else {
  85              $lexer = $config->get('Core.LexerImpl');
  86          }
  87  
  88          $needs_tracking =
  89              $config->get('Core.MaintainLineNumbers') ||
  90              $config->get('Core.CollectErrors');
  91  
  92          $inst = null;
  93          if (is_object($lexer)) {
  94              $inst = $lexer;
  95          } else {
  96              if (is_null($lexer)) {
  97                  do {
  98                      // auto-detection algorithm
  99                      if ($needs_tracking) {
 100                          $lexer = 'DirectLex';
 101                          break;
 102                      }
 103  
 104                      if (class_exists('DOMDocument', false) &&
 105                          method_exists('DOMDocument', 'loadHTML') &&
 106                          !extension_loaded('domxml')
 107                      ) {
 108                          // check for DOM support, because while it's part of the
 109                          // core, it can be disabled compile time. Also, the PECL
 110                          // domxml extension overrides the default DOM, and is evil
 111                          // and nasty and we shan't bother to support it
 112                          $lexer = 'DOMLex';
 113                      } else {
 114                          $lexer = 'DirectLex';
 115                      }
 116                  } while (0);
 117              } // do..while so we can break
 118  
 119              // instantiate recognized string names
 120              switch ($lexer) {
 121                  case 'DOMLex':
 122                      $inst = new HTMLPurifier_Lexer_DOMLex();
 123                      break;
 124                  case 'DirectLex':
 125                      $inst = new HTMLPurifier_Lexer_DirectLex();
 126                      break;
 127                  case 'PH5P':
 128                      $inst = new HTMLPurifier_Lexer_PH5P();
 129                      break;
 130                  default:
 131                      throw new HTMLPurifier_Exception(
 132                          "Cannot instantiate unrecognized Lexer type " .
 133                          htmlspecialchars($lexer)
 134                      );
 135              }
 136          }
 137  
 138          if (!$inst) {
 139              throw new HTMLPurifier_Exception('No lexer was instantiated');
 140          }
 141  
 142          // once PHP DOM implements native line numbers, or we
 143          // hack out something using XSLT, remove this stipulation
 144          if ($needs_tracking && !$inst->tracksLineNumbers) {
 145              throw new HTMLPurifier_Exception(
 146                  'Cannot use lexer that does not support line numbers with ' .
 147                  'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
 148              );
 149          }
 150  
 151          return $inst;
 152  
 153      }
 154  
 155      // -- CONVENIENCE MEMBERS ---------------------------------------------
 156  
 157      public function __construct()
 158      {
 159          $this->_entity_parser = new HTMLPurifier_EntityParser();
 160      }
 161  
 162      /**
 163       * Most common entity to raw value conversion table for special entities.
 164       * @type array
 165       */
 166      protected $_special_entity2str =
 167          array(
 168              '&quot;' => '"',
 169              '&amp;' => '&',
 170              '&lt;' => '<',
 171              '&gt;' => '>',
 172              '&#39;' => "'",
 173              '&#039;' => "'",
 174              '&#x27;' => "'"
 175          );
 176  
 177      public function parseText($string, $config) {
 178          return $this->parseData($string, false, $config);
 179      }
 180  
 181      public function parseAttr($string, $config) {
 182          return $this->parseData($string, true, $config);
 183      }
 184  
 185      /**
 186       * Parses special entities into the proper characters.
 187       *
 188       * This string will translate escaped versions of the special characters
 189       * into the correct ones.
 190       *
 191       * @param string $string String character data to be parsed.
 192       * @return string Parsed character data.
 193       */
 194      public function parseData($string, $is_attr, $config)
 195      {
 196          // following functions require at least one character
 197          if ($string === '') {
 198              return '';
 199          }
 200  
 201          // subtracts amps that cannot possibly be escaped
 202          $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
 203              ($string[strlen($string) - 1] === '&' ? 1 : 0);
 204  
 205          if (!$num_amp) {
 206              return $string;
 207          } // abort if no entities
 208          $num_esc_amp = substr_count($string, '&amp;');
 209          $string = strtr($string, $this->_special_entity2str);
 210  
 211          // code duplication for sake of optimization, see above
 212          $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
 213              ($string[strlen($string) - 1] === '&' ? 1 : 0);
 214  
 215          if ($num_amp_2 <= $num_esc_amp) {
 216              return $string;
 217          }
 218  
 219          // hmm... now we have some uncommon entities. Use the callback.
 220          if ($config->get('Core.LegacyEntityDecoder')) {
 221              $string = $this->_entity_parser->substituteSpecialEntities($string);
 222          } else {
 223              if ($is_attr) {
 224                  $string = $this->_entity_parser->substituteAttrEntities($string);
 225              } else {
 226                  $string = $this->_entity_parser->substituteTextEntities($string);
 227              }
 228          }
 229          return $string;
 230      }
 231  
 232      /**
 233       * Lexes an HTML string into tokens.
 234       * @param $string String HTML.
 235       * @param HTMLPurifier_Config $config
 236       * @param HTMLPurifier_Context $context
 237       * @return HTMLPurifier_Token[] array representation of HTML.
 238       */
 239      public function tokenizeHTML($string, $config, $context)
 240      {
 241          trigger_error('Call to abstract class', E_USER_ERROR);
 242      }
 243  
 244      /**
 245       * Translates CDATA sections into regular sections (through escaping).
 246       * @param string $string HTML string to process.
 247       * @return string HTML with CDATA sections escaped.
 248       */
 249      protected static function escapeCDATA($string)
 250      {
 251          return preg_replace_callback(
 252              '/<!\[CDATA\[(.+?)\]\]>/s',
 253              array('HTMLPurifier_Lexer', 'CDATACallback'),
 254              $string
 255          );
 256      }
 257  
 258      /**
 259       * Special CDATA case that is especially convoluted for <script>
 260       * @param string $string HTML string to process.
 261       * @return string HTML with CDATA sections escaped.
 262       */
 263      protected static function escapeCommentedCDATA($string)
 264      {
 265          return preg_replace_callback(
 266              '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
 267              array('HTMLPurifier_Lexer', 'CDATACallback'),
 268              $string
 269          );
 270      }
 271  
 272      /**
 273       * Special Internet Explorer conditional comments should be removed.
 274       * @param string $string HTML string to process.
 275       * @return string HTML with conditional comments removed.
 276       */
 277      protected static function removeIEConditional($string)
 278      {
 279          return preg_replace(
 280              '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
 281              '',
 282              $string
 283          );
 284      }
 285  
 286      /**
 287       * Callback function for escapeCDATA() that does the work.
 288       *
 289       * @warning Though this is public in order to let the callback happen,
 290       *          calling it directly is not recommended.
 291       * @param array $matches PCRE matches array, with index 0 the entire match
 292       *                  and 1 the inside of the CDATA section.
 293       * @return string Escaped internals of the CDATA section.
 294       */
 295      protected static function CDATACallback($matches)
 296      {
 297          // not exactly sure why the character set is needed, but whatever
 298          return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
 299      }
 300  
 301      /**
 302       * Takes a piece of HTML and normalizes it by converting entities, fixing
 303       * encoding, extracting bits, and other good stuff.
 304       * @param string $html HTML.
 305       * @param HTMLPurifier_Config $config
 306       * @param HTMLPurifier_Context $context
 307       * @return string
 308       * @todo Consider making protected
 309       */
 310      public function normalize($html, $config, $context)
 311      {
 312          // normalize newlines to \n
 313          if ($config->get('Core.NormalizeNewlines')) {
 314              $html = str_replace("\r\n", "\n", (string)$html);
 315              $html = str_replace("\r", "\n", (string)$html);
 316          }
 317  
 318          if ($config->get('HTML.Trusted')) {
 319              // escape convoluted CDATA
 320              $html = $this->escapeCommentedCDATA($html);
 321          }
 322  
 323          // escape CDATA
 324          $html = $this->escapeCDATA($html);
 325  
 326          $html = $this->removeIEConditional($html);
 327  
 328          // extract body from document if applicable
 329          if ($config->get('Core.ConvertDocumentToFragment')) {
 330              $e = false;
 331              if ($config->get('Core.CollectErrors')) {
 332                  $e =& $context->get('ErrorCollector');
 333              }
 334              $new_html = $this->extractBody($html);
 335              if ($e && $new_html != $html) {
 336                  $e->send(E_WARNING, 'Lexer: Extracted body');
 337              }
 338              $html = $new_html;
 339          }
 340  
 341          // expand entities that aren't the big five
 342          if ($config->get('Core.LegacyEntityDecoder')) {
 343              $html = $this->_entity_parser->substituteNonSpecialEntities($html);
 344          }
 345  
 346          // clean into wellformed UTF-8 string for an SGML context: this has
 347          // to be done after entity expansion because the entities sometimes
 348          // represent non-SGML characters (horror, horror!)
 349          $html = HTMLPurifier_Encoder::cleanUTF8($html);
 350  
 351          // if processing instructions are to removed, remove them now
 352          if ($config->get('Core.RemoveProcessingInstructions')) {
 353              $html = preg_replace('#<\?.+?\?>#s', '', $html);
 354          }
 355  
 356          $hidden_elements = $config->get('Core.HiddenElements');
 357          if ($config->get('Core.AggressivelyRemoveScript') &&
 358              !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
 359              || empty($hidden_elements["script"]))) {
 360              $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
 361          }
 362  
 363          return $html;
 364      }
 365  
 366      /**
 367       * Takes a string of HTML (fragment or document) and returns the content
 368       * @todo Consider making protected
 369       */
 370      public function extractBody($html)
 371      {
 372          $matches = array();
 373          $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
 374          if ($result) {
 375              // Make sure it's not in a comment
 376              $comment_start = strrpos($matches[1], '<!--');
 377              $comment_end   = strrpos($matches[1], '-->');
 378              if ($comment_start === false ||
 379                  ($comment_end !== false && $comment_end > $comment_start)) {
 380                  return $matches[2];
 381              }
 382          }
 383          return $html;
 384      }
 385  }
 386  
 387  // vim: et sw=4 sts=4