Search moodle.org's
Developer Documentation

See Release Notes
Long Term Support Release

  • Bug fixes for general core bugs in 4.1.x will end 13 November 2023 (12 months).
  • Bug fixes for security issues in 4.1.x will end 10 November 2025 (36 months).
  • PHP version: minimum PHP 7.4.0. Note: the minimum PHP version has increased since Moodle 4.0. PHP 8.0.x is supported too.
   1  <?php
   2  
   3  // If we want to implement error collecting here, we'll need to use some sort
   4  // of global data (probably trigger_error) because it's impossible to pass
   5  // $config or $context to the callback functions.
   6  
   7  /**
   8   * Handles referencing and derefencing character entities
   9   */
  10  class HTMLPurifier_EntityParser
  11  {
  12  
  13      /**
  14       * Reference to entity lookup table.
  15       * @type HTMLPurifier_EntityLookup
  16       */
  17      protected $_entity_lookup;
  18  
  19      /**
  20       * Callback regex string for entities in text.
  21       * @type string
  22       */
  23      protected $_textEntitiesRegex;
  24  
  25      /**
  26       * Callback regex string for entities in attributes.
  27       * @type string
  28       */
  29      protected $_attrEntitiesRegex;
  30  
  31      /**
  32       * Tests if the beginning of a string is a semi-optional regex
  33       */
  34      protected $_semiOptionalPrefixRegex;
  35  
  36      public function __construct() {
  37          // From
  38          // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
  39          $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";
  40  
  41          // NB: three empty captures to put the fourth match in the right
  42          // place
  43          $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";
  44  
  45          $this->_textEntitiesRegex =
  46              '/&(?:'.
  47              // hex
  48              '[#]x([a-fA-F0-9]+);?|'.
  49              // dec
  50              '[#]0*(\d+);?|'.
  51              // string (mandatory semicolon)
  52              // NB: order matters: match semicolon preferentially
  53              '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
  54              // string (optional semicolon)
  55              "($semi_optional)".
  56              ')/';
  57  
  58          $this->_attrEntitiesRegex =
  59              '/&(?:'.
  60              // hex
  61              '[#]x([a-fA-F0-9]+);?|'.
  62              // dec
  63              '[#]0*(\d+);?|'.
  64              // string (mandatory semicolon)
  65              // NB: order matters: match semicolon preferentially
  66              '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
  67              // string (optional semicolon)
  68              // don't match if trailing is equals or alphanumeric (URL
  69              // like)
  70              "($semi_optional)(?![=;A-Za-z0-9])".
  71              ')/';
  72  
  73      }
  74  
  75      /**
  76       * Substitute entities with the parsed equivalents.  Use this on
  77       * textual data in an HTML document (as opposed to attributes.)
  78       *
  79       * @param string $string String to have entities parsed.
  80       * @return string Parsed string.
  81       */
  82      public function substituteTextEntities($string)
  83      {
  84          return preg_replace_callback(
  85              $this->_textEntitiesRegex,
  86              array($this, 'entityCallback'),
  87              $string
  88          );
  89      }
  90  
  91      /**
  92       * Substitute entities with the parsed equivalents.  Use this on
  93       * attribute contents in documents.
  94       *
  95       * @param string $string String to have entities parsed.
  96       * @return string Parsed string.
  97       */
  98      public function substituteAttrEntities($string)
  99      {
 100          return preg_replace_callback(
 101              $this->_attrEntitiesRegex,
 102              array($this, 'entityCallback'),
 103              $string
 104          );
 105      }
 106  
 107      /**
 108       * Callback function for substituteNonSpecialEntities() that does the work.
 109       *
 110       * @param array $matches  PCRE matches array, with 0 the entire match, and
 111       *                  either index 1, 2 or 3 set with a hex value, dec value,
 112       *                  or string (respectively).
 113       * @return string Replacement string.
 114       */
 115  
 116      protected function entityCallback($matches)
 117      {
 118          $entity = $matches[0];
 119          $hex_part = @$matches[1];
 120          $dec_part = @$matches[2];
 121          $named_part = empty($matches[3]) ? (empty($matches[4]) ? "" : $matches[4]) : $matches[3];
 122          if ($hex_part !== NULL && $hex_part !== "") {
 123              return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
 124          } elseif ($dec_part !== NULL && $dec_part !== "") {
 125              return HTMLPurifier_Encoder::unichr((int) $dec_part);
 126          } else {
 127              if (!$this->_entity_lookup) {
 128                  $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
 129              }
 130              if (isset($this->_entity_lookup->table[$named_part])) {
 131                  return $this->_entity_lookup->table[$named_part];
 132              } else {
 133                  // exact match didn't match anything, so test if
 134                  // any of the semicolon optional match the prefix.
 135                  // Test that this is an EXACT match is important to
 136                  // prevent infinite loop
 137                  if (!empty($matches[3])) {
 138                      return preg_replace_callback(
 139                          $this->_semiOptionalPrefixRegex,
 140                          array($this, 'entityCallback'),
 141                          $entity
 142                      );
 143                  }
 144                  return $entity;
 145              }
 146          }
 147      }
 148  
 149      // LEGACY CODE BELOW
 150  
 151      /**
 152       * Callback regex string for parsing entities.
 153       * @type string
 154       */
 155      protected $_substituteEntitiesRegex =
 156          '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
 157          //     1. hex             2. dec      3. string (XML style)
 158  
 159      /**
 160       * Decimal to parsed string conversion table for special entities.
 161       * @type array
 162       */
 163      protected $_special_dec2str =
 164              array(
 165                      34 => '"',
 166                      38 => '&',
 167                      39 => "'",
 168                      60 => '<',
 169                      62 => '>'
 170              );
 171  
 172      /**
 173       * Stripped entity names to decimal conversion table for special entities.
 174       * @type array
 175       */
 176      protected $_special_ent2dec =
 177              array(
 178                      'quot' => 34,
 179                      'amp'  => 38,
 180                      'lt'   => 60,
 181                      'gt'   => 62
 182              );
 183  
 184      /**
 185       * Substitutes non-special entities with their parsed equivalents. Since
 186       * running this whenever you have parsed character is t3h 5uck, we run
 187       * it before everything else.
 188       *
 189       * @param string $string String to have non-special entities parsed.
 190       * @return string Parsed string.
 191       */
 192      public function substituteNonSpecialEntities($string)
 193      {
 194          // it will try to detect missing semicolons, but don't rely on it
 195          return preg_replace_callback(
 196              $this->_substituteEntitiesRegex,
 197              array($this, 'nonSpecialEntityCallback'),
 198              $string
 199          );
 200      }
 201  
 202      /**
 203       * Callback function for substituteNonSpecialEntities() that does the work.
 204       *
 205       * @param array $matches  PCRE matches array, with 0 the entire match, and
 206       *                  either index 1, 2 or 3 set with a hex value, dec value,
 207       *                  or string (respectively).
 208       * @return string Replacement string.
 209       */
 210  
 211      protected function nonSpecialEntityCallback($matches)
 212      {
 213          // replaces all but big five
 214          $entity = $matches[0];
 215          $is_num = (@$matches[0][1] === '#');
 216          if ($is_num) {
 217              $is_hex = (@$entity[2] === 'x');
 218              $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
 219              // abort for special characters
 220              if (isset($this->_special_dec2str[$code])) {
 221                  return $entity;
 222              }
 223              return HTMLPurifier_Encoder::unichr($code);
 224          } else {
 225              if (isset($this->_special_ent2dec[$matches[3]])) {
 226                  return $entity;
 227              }
 228              if (!$this->_entity_lookup) {
 229                  $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
 230              }
 231              if (isset($this->_entity_lookup->table[$matches[3]])) {
 232                  return $this->_entity_lookup->table[$matches[3]];
 233              } else {
 234                  return $entity;
 235              }
 236          }
 237      }
 238  
 239      /**
 240       * Substitutes only special entities with their parsed equivalents.
 241       *
 242       * @notice We try to avoid calling this function because otherwise, it
 243       * would have to be called a lot (for every parsed section).
 244       *
 245       * @param string $string String to have non-special entities parsed.
 246       * @return string Parsed string.
 247       */
 248      public function substituteSpecialEntities($string)
 249      {
 250          return preg_replace_callback(
 251              $this->_substituteEntitiesRegex,
 252              array($this, 'specialEntityCallback'),
 253              $string
 254          );
 255      }
 256  
 257      /**
 258       * Callback function for substituteSpecialEntities() that does the work.
 259       *
 260       * This callback has same syntax as nonSpecialEntityCallback().
 261       *
 262       * @param array $matches  PCRE-style matches array, with 0 the entire match, and
 263       *                  either index 1, 2 or 3 set with a hex value, dec value,
 264       *                  or string (respectively).
 265       * @return string Replacement string.
 266       */
 267      protected function specialEntityCallback($matches)
 268      {
 269          $entity = $matches[0];
 270          $is_num = (@$matches[0][1] === '#');
 271          if ($is_num) {
 272              $is_hex = (@$entity[2] === 'x');
 273              $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
 274              return isset($this->_special_dec2str[$int]) ?
 275                  $this->_special_dec2str[$int] :
 276                  $entity;
 277          } else {
 278              return isset($this->_special_ent2dec[$matches[3]]) ?
 279                  $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
 280                  $entity;
 281          }
 282      }
 283  }
 284  
 285  // vim: et sw=4 sts=4