Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 3.10.x will end 8 November 2021 (12 months).
  • Bug fixes for security issues in 3.10.x will end 9 May 2022 (18 months).
  • PHP version: minimum PHP 7.2.0. Note: the minimum PHP version has increased since Moodle 3.8. PHP 7.3.x and 7.4.x are also supported.
   1  <?php
   2  
   3  // if we want to implement error collecting here, we'll need to use some sort

   4  // of global data (probably trigger_error) because it's impossible to pass

   5  // $config or $context to the callback functions.

   6  
   7  /**

   8   * Handles referencing and derefencing character entities

   9   */
  10  class HTMLPurifier_EntityParser
  11  {
  12  
  13      /**

  14       * Reference to entity lookup table.

  15       * @type HTMLPurifier_EntityLookup

  16       */
  17      protected $_entity_lookup;
  18  
  19      /**

  20       * Callback regex string for entities in text.

  21       * @type string

  22       */
  23      protected $_textEntitiesRegex;
  24  
  25      /**

  26       * Callback regex string for entities in attributes.

  27       * @type string

  28       */
  29      protected $_attrEntitiesRegex;
  30  
  31      /**

  32       * Tests if the beginning of a string is a semi-optional regex

  33       */
  34      protected $_semiOptionalPrefixRegex;
  35  
  36      public function __construct() {
  37          // From

  38          // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon

  39          $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";
  40  
  41          // NB: three empty captures to put the fourth match in the right

  42          // place

  43          $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";
  44  
  45          $this->_textEntitiesRegex =
  46              '/&(?:'.
  47              // hex

  48              '[#]x([a-fA-F0-9]+);?|'.
  49              // dec

  50              '[#]0*(\d+);?|'.
  51              // string (mandatory semicolon)

  52              // NB: order matters: match semicolon preferentially

  53              '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
  54              // string (optional semicolon)

  55              "($semi_optional)".
  56              ')/';
  57  
  58          $this->_attrEntitiesRegex =
  59              '/&(?:'.
  60              // hex

  61              '[#]x([a-fA-F0-9]+);?|'.
  62              // dec

  63              '[#]0*(\d+);?|'.
  64              // string (mandatory semicolon)

  65              // NB: order matters: match semicolon preferentially

  66              '([A-Za-z_:][A-Za-z0-9.\-_:]*);|'.
  67              // string (optional semicolon)

  68              // don't match if trailing is equals or alphanumeric (URL

  69              // like)

  70              "($semi_optional)(?![=;A-Za-z0-9])".
  71              ')/';
  72  
  73      }
  74  
  75      /**

  76       * Substitute entities with the parsed equivalents.  Use this on

  77       * textual data in an HTML document (as opposed to attributes.)

  78       *

  79       * @param string $string String to have entities parsed.

  80       * @return string Parsed string.

  81       */
  82      public function substituteTextEntities($string)
  83      {
  84          return preg_replace_callback(
  85              $this->_textEntitiesRegex,
  86              array($this, 'entityCallback'),
  87              $string
  88          );
  89      }
  90  
  91      /**

  92       * Substitute entities with the parsed equivalents.  Use this on

  93       * attribute contents in documents.

  94       *

  95       * @param string $string String to have entities parsed.

  96       * @return string Parsed string.

  97       */
  98      public function substituteAttrEntities($string)
  99      {
 100          return preg_replace_callback(
 101              $this->_attrEntitiesRegex,
 102              array($this, 'entityCallback'),
 103              $string
 104          );
 105      }
 106  
 107      /**

 108       * Callback function for substituteNonSpecialEntities() that does the work.

 109       *

 110       * @param array $matches  PCRE matches array, with 0 the entire match, and

 111       *                  either index 1, 2 or 3 set with a hex value, dec value,

 112       *                  or string (respectively).

 113       * @return string Replacement string.

 114       */
 115  
 116      protected function entityCallback($matches)
 117      {
 118          $entity = $matches[0];
 119          $hex_part = @$matches[1];
 120          $dec_part = @$matches[2];
 121          $named_part = empty($matches[3]) ? (empty($matches[4]) ? "" : $matches[4]) : $matches[3];
 122          if ($hex_part !== NULL && $hex_part !== "") {
 123              return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
 124          } elseif ($dec_part !== NULL && $dec_part !== "") {
 125              return HTMLPurifier_Encoder::unichr((int) $dec_part);
 126          } else {
 127              if (!$this->_entity_lookup) {
 128                  $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
 129              }
 130              if (isset($this->_entity_lookup->table[$named_part])) {
 131                  return $this->_entity_lookup->table[$named_part];
 132              } else {
 133                  // exact match didn't match anything, so test if

 134                  // any of the semicolon optional match the prefix.

 135                  // Test that this is an EXACT match is important to

 136                  // prevent infinite loop

 137                  if (!empty($matches[3])) {
 138                      return preg_replace_callback(
 139                          $this->_semiOptionalPrefixRegex,
 140                          array($this, 'entityCallback'),
 141                          $entity
 142                      );
 143                  }
 144                  return $entity;
 145              }
 146          }
 147      }
 148  
 149      // LEGACY CODE BELOW

 150  
 151      /**

 152       * Callback regex string for parsing entities.

 153       * @type string

 154       */
 155      protected $_substituteEntitiesRegex =
 156          '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
 157          //     1. hex             2. dec      3. string (XML style)

 158  
 159      /**

 160       * Decimal to parsed string conversion table for special entities.

 161       * @type array

 162       */
 163      protected $_special_dec2str =
 164              array(
 165                      34 => '"',
 166                      38 => '&',
 167                      39 => "'",
 168                      60 => '<',
 169                      62 => '>'
 170              );
 171  
 172      /**

 173       * Stripped entity names to decimal conversion table for special entities.

 174       * @type array

 175       */
 176      protected $_special_ent2dec =
 177              array(
 178                      'quot' => 34,
 179                      'amp'  => 38,
 180                      'lt'   => 60,
 181                      'gt'   => 62
 182              );
 183  
 184      /**

 185       * Substitutes non-special entities with their parsed equivalents. Since

 186       * running this whenever you have parsed character is t3h 5uck, we run

 187       * it before everything else.

 188       *

 189       * @param string $string String to have non-special entities parsed.

 190       * @return string Parsed string.

 191       */
 192      public function substituteNonSpecialEntities($string)
 193      {
 194          // it will try to detect missing semicolons, but don't rely on it

 195          return preg_replace_callback(
 196              $this->_substituteEntitiesRegex,
 197              array($this, 'nonSpecialEntityCallback'),
 198              $string
 199          );
 200      }
 201  
 202      /**

 203       * Callback function for substituteNonSpecialEntities() that does the work.

 204       *

 205       * @param array $matches  PCRE matches array, with 0 the entire match, and

 206       *                  either index 1, 2 or 3 set with a hex value, dec value,

 207       *                  or string (respectively).

 208       * @return string Replacement string.

 209       */
 210  
 211      protected function nonSpecialEntityCallback($matches)
 212      {
 213          // replaces all but big five

 214          $entity = $matches[0];
 215          $is_num = (@$matches[0][1] === '#');
 216          if ($is_num) {
 217              $is_hex = (@$entity[2] === 'x');
 218              $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
 219              // abort for special characters

 220              if (isset($this->_special_dec2str[$code])) {
 221                  return $entity;
 222              }
 223              return HTMLPurifier_Encoder::unichr($code);
 224          } else {
 225              if (isset($this->_special_ent2dec[$matches[3]])) {
 226                  return $entity;
 227              }
 228              if (!$this->_entity_lookup) {
 229                  $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
 230              }
 231              if (isset($this->_entity_lookup->table[$matches[3]])) {
 232                  return $this->_entity_lookup->table[$matches[3]];
 233              } else {
 234                  return $entity;
 235              }
 236          }
 237      }
 238  
 239      /**

 240       * Substitutes only special entities with their parsed equivalents.

 241       *

 242       * @notice We try to avoid calling this function because otherwise, it

 243       * would have to be called a lot (for every parsed section).

 244       *

 245       * @param string $string String to have non-special entities parsed.

 246       * @return string Parsed string.

 247       */
 248      public function substituteSpecialEntities($string)
 249      {
 250          return preg_replace_callback(
 251              $this->_substituteEntitiesRegex,
 252              array($this, 'specialEntityCallback'),
 253              $string
 254          );
 255      }
 256  
 257      /**

 258       * Callback function for substituteSpecialEntities() that does the work.

 259       *

 260       * This callback has same syntax as nonSpecialEntityCallback().

 261       *

 262       * @param array $matches  PCRE-style matches array, with 0 the entire match, and

 263       *                  either index 1, 2 or 3 set with a hex value, dec value,

 264       *                  or string (respectively).

 265       * @return string Replacement string.

 266       */
 267      protected function specialEntityCallback($matches)
 268      {
 269          $entity = $matches[0];
 270          $is_num = (@$matches[0][1] === '#');
 271          if ($is_num) {
 272              $is_hex = (@$entity[2] === 'x');
 273              $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
 274              return isset($this->_special_dec2str[$int]) ?
 275                  $this->_special_dec2str[$int] :
 276                  $entity;
 277          } else {
 278              return isset($this->_special_ent2dec[$matches[3]]) ?
 279                  $this->_special_dec2str[$this->_special_ent2dec[$matches[3]]] :
 280                  $entity;
 281          }
 282      }
 283  }
 284  
 285  // vim: et sw=4 sts=4