Moodle 4.1 XRef and Diffs

Search moodle.org's
Developer Documentation
See Release Notes
Long Term Support Release
Bug fixes for general core bugs in 4.1.x will end 13 November 2023 (12 months).
Bug fixes for security issues in 4.1.x will end 10 November 2025 (36 months).
PHP version: minimum PHP 7.4.0. Note: the minimum PHP version has increased since Moodle 4.0. PHP 8.0.x is also supported.
Moodle 4.1 Database Schema (by Marcus Green)
/lib/htmlpurifier/HTMLPurifier/Lexer/ -> DirectLex.php (source)
   1  <?php
   2  
   3  /**
   4   * Our in-house implementation of a parser.
   5   *
   6   * A pure PHP parser, DirectLex has absolutely no dependencies, making
   7   * it a reasonably good default for PHP4.  Written with efficiency in mind,
   8   * it can be four times faster than HTMLPurifier_Lexer_PEARSax3, although it
   9   * pales in comparison to HTMLPurifier_Lexer_DOMLex.
  10   *
  11   * @todo Reread XML spec and document differences.
  12   */
  13  class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
  14  {
  15      /**
  16       * @type bool
  17       */
  18      public $tracksLineNumbers = true;
  19  
  20      /**
  21       * Whitespace characters for str(c)spn.
  22       * @type string
  23       */
  24      protected $_whitespace = "\x20\x09\x0D\x0A";
  25  
  26      /**
  27       * Callback function for script CDATA fudge
  28       * @param array $matches, in form of array(opening tag, contents, closing tag)
  29       * @return string
  30       */
  31      protected function scriptCallback($matches)
  32      {
  33          return $matches[1] . htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8') . $matches[3];
  34      }
  35  
  36      /**
  37       * @param String $html
  38       * @param HTMLPurifier_Config $config
  39       * @param HTMLPurifier_Context $context
  40       * @return array|HTMLPurifier_Token[]
  41       */
  42      public function tokenizeHTML($html, $config, $context)
  43      {
  44          // special normalization for script tags without any armor
  45          // our "armor" heurstic is a < sign any number of whitespaces after
  46          // the first script tag
  47          if ($config->get('HTML.Trusted')) {
  48              $html = preg_replace_callback(
  49                  '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
  50                  array($this, 'scriptCallback'),
  51                  $html
  52              );
  53          }
  54  
  55          $html = $this->normalize($html, $config, $context);
  56  
  57          $cursor = 0; // our location in the text
  58          $inside_tag = false; // whether or not we're parsing the inside of a tag
  59          $array = array(); // result array
  60  
  61          // This is also treated to mean maintain *column* numbers too
  62          $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');
  63  
  64          if ($maintain_line_numbers === null) {
  65              // automatically determine line numbering by checking
  66              // if error collection is on
  67              $maintain_line_numbers = $config->get('Core.CollectErrors');
  68          }
  69  
  70          if ($maintain_line_numbers) {
  71              $current_line = 1;
  72              $current_col = 0;
  73              $length = strlen($html);
  74          } else {
  75              $current_line = false;
  76              $current_col = false;
  77              $length = false;
  78          }
  79          $context->register('CurrentLine', $current_line);
  80          $context->register('CurrentCol', $current_col);
  81          $nl = "\n";
  82          // how often to manually recalculate. This will ALWAYS be right,
  83          // but it's pretty wasteful. Set to 0 to turn off
  84          $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');
  85  
  86          $e = false;
  87          if ($config->get('Core.CollectErrors')) {
  88              $e =& $context->get('ErrorCollector');
  89          }
  90  
  91          // for testing synchronization
  92          $loops = 0;
  93  
  94          while (++$loops) {
  95              // $cursor is either at the start of a token, or inside of
  96              // a tag (i.e. there was a < immediately before it), as indicated
  97              // by $inside_tag
  98  
  99              if ($maintain_line_numbers) {
 100                  // $rcursor, however, is always at the start of a token.
 101                  $rcursor = $cursor - (int)$inside_tag;
 102  
 103                  // Column number is cheap, so we calculate it every round.
 104                  // We're interested at the *end* of the newline string, so
 105                  // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
 106                  // from our "rcursor" position.
 107                  $nl_pos = strrpos($html, $nl, $rcursor - $length);
 108                  $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);
 109  
 110                  // recalculate lines
 111                  if ($synchronize_interval && // synchronization is on
 112                      $cursor > 0 && // cursor is further than zero
 113                      $loops % $synchronize_interval === 0) { // time to synchronize!
 114                      $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
 115                  }
 116              }
 117  
 118              $position_next_lt = strpos($html, '<', $cursor);
 119              $position_next_gt = strpos($html, '>', $cursor);
 120  
 121              // triggers on "<b>asdf</b>" but not "asdf <b></b>"
 122              // special case to set up context
 123              if ($position_next_lt === $cursor) {
 124                  $inside_tag = true;
 125                  $cursor++;
 126              }
 127  
 128              if (!$inside_tag && $position_next_lt !== false) {
 129                  // We are not inside tag and there still is another tag to parse
 130                  $token = new
 131                  HTMLPurifier_Token_Text(
 132                      $this->parseText(
 133                          substr(
 134                              $html,
 135                              $cursor,
 136                              $position_next_lt - $cursor
 137                          ), $config
 138                      )
 139                  );
 140                  if ($maintain_line_numbers) {
 141                      $token->rawPosition($current_line, $current_col);
 142                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
 143                  }
 144                  $array[] = $token;
 145                  $cursor = $position_next_lt + 1;
 146                  $inside_tag = true;
 147                  continue;
 148              } elseif (!$inside_tag) {
 149                  // We are not inside tag but there are no more tags
 150                  // If we're already at the end, break
 151                  if ($cursor === strlen($html)) {
 152                      break;
 153                  }
 154                  // Create Text of rest of string
 155                  $token = new
 156                  HTMLPurifier_Token_Text(
 157                      $this->parseText(
 158                          substr(
 159                              $html,
 160                              $cursor
 161                          ), $config
 162                      )
 163                  );
 164                  if ($maintain_line_numbers) {
 165                      $token->rawPosition($current_line, $current_col);
 166                  }
 167                  $array[] = $token;
 168                  break;
 169              } elseif ($inside_tag && $position_next_gt !== false) {
 170                  // We are in tag and it is well formed
 171                  // Grab the internals of the tag
 172                  $strlen_segment = $position_next_gt - $cursor;
 173  
 174                  if ($strlen_segment < 1) {
 175                      // there's nothing to process!
 176                      $token = new HTMLPurifier_Token_Text('<');
 177                      $cursor++;
 178                      continue;
 179                  }
 180  
 181                  $segment = substr($html, $cursor, $strlen_segment);
 182  
 183                  if ($segment === false) {
 184                      // somehow, we attempted to access beyond the end of
 185                      // the string, defense-in-depth, reported by Nate Abele
 186                      break;
 187                  }
 188  
 189                  // Check if it's a comment
 190                  if (substr($segment, 0, 3) === '!--') {
 191                      // re-determine segment length, looking for -->
 192                      $position_comment_end = strpos($html, '-->', $cursor);
 193                      if ($position_comment_end === false) {
 194                          // uh oh, we have a comment that extends to
 195                          // infinity. Can't be helped: set comment
 196                          // end position to end of string
 197                          if ($e) {
 198                              $e->send(E_WARNING, 'Lexer: Unclosed comment');
 199                          }
 200                          $position_comment_end = strlen($html);
 201                          $end = true;
 202                      } else {
 203                          $end = false;
 204                      }
 205                      $strlen_segment = $position_comment_end - $cursor;
 206                      $segment = substr($html, $cursor, $strlen_segment);
 207                      $token = new
 208                      HTMLPurifier_Token_Comment(
 209                          substr(
 210                              $segment,
 211                              3,
 212                              $strlen_segment - 3
 213                          )
 214                      );
 215                      if ($maintain_line_numbers) {
 216                          $token->rawPosition($current_line, $current_col);
 217                          $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
 218                      }
 219                      $array[] = $token;
 220                      $cursor = $end ? $position_comment_end : $position_comment_end + 3;
 221                      $inside_tag = false;
 222                      continue;
 223                  }
 224  
 225                  // Check if it's an end tag
 226                  $is_end_tag = (strpos($segment, '/') === 0);
 227                  if ($is_end_tag) {
 228                      $type = substr($segment, 1);
 229                      $token = new HTMLPurifier_Token_End($type);
 230                      if ($maintain_line_numbers) {
 231                          $token->rawPosition($current_line, $current_col);
 232                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 233                      }
 234                      $array[] = $token;
 235                      $inside_tag = false;
 236                      $cursor = $position_next_gt + 1;
 237                      continue;
 238                  }
 239  
 240                  // Check leading character is alnum, if not, we may
 241                  // have accidently grabbed an emoticon. Translate into
 242                  // text and go our merry way
 243                  if (!ctype_alpha($segment[0])) {
 244                      // XML:  $segment[0] !== '_' && $segment[0] !== ':'
 245                      if ($e) {
 246                          $e->send(E_NOTICE, 'Lexer: Unescaped lt');
 247                      }
 248                      $token = new HTMLPurifier_Token_Text('<');
 249                      if ($maintain_line_numbers) {
 250                          $token->rawPosition($current_line, $current_col);
 251                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 252                      }
 253                      $array[] = $token;
 254                      $inside_tag = false;
 255                      continue;
 256                  }
 257  
 258                  // Check if it is explicitly self closing, if so, remove
 259                  // trailing slash. Remember, we could have a tag like <br>, so
 260                  // any later token processing scripts must convert improperly
 261                  // classified EmptyTags from StartTags.
 262                  $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
 263                  if ($is_self_closing) {
 264                      $strlen_segment--;
 265                      $segment = substr($segment, 0, $strlen_segment);
 266                  }
 267  
 268                  // Check if there are any attributes
 269                  $position_first_space = strcspn($segment, $this->_whitespace);
 270  
 271                  if ($position_first_space >= $strlen_segment) {
 272                      if ($is_self_closing) {
 273                          $token = new HTMLPurifier_Token_Empty($segment);
 274                      } else {
 275                          $token = new HTMLPurifier_Token_Start($segment);
 276                      }
 277                      if ($maintain_line_numbers) {
 278                          $token->rawPosition($current_line, $current_col);
 279                          $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 280                      }
 281                      $array[] = $token;
 282                      $inside_tag = false;
 283                      $cursor = $position_next_gt + 1;
 284                      continue;
 285                  }
 286  
 287                  // Grab out all the data
 288                  $type = substr($segment, 0, $position_first_space);
 289                  $attribute_string =
 290                      trim(
 291                          substr(
 292                              $segment,
 293                              $position_first_space
 294                          )
 295                      );
 296                  if ($attribute_string) {
 297                      $attr = $this->parseAttributeString(
 298                          $attribute_string,
 299                          $config,
 300                          $context
 301                      );
 302                  } else {
 303                      $attr = array();
 304                  }
 305  
 306                  if ($is_self_closing) {
 307                      $token = new HTMLPurifier_Token_Empty($type, $attr);
 308                  } else {
 309                      $token = new HTMLPurifier_Token_Start($type, $attr);
 310                  }
 311                  if ($maintain_line_numbers) {
 312                      $token->rawPosition($current_line, $current_col);
 313                      $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
 314                  }
 315                  $array[] = $token;
 316                  $cursor = $position_next_gt + 1;
 317                  $inside_tag = false;
 318                  continue;
 319              } else {
 320                  // inside tag, but there's no ending > sign
 321                  if ($e) {
 322                      $e->send(E_WARNING, 'Lexer: Missing gt');
 323                  }
 324                  $token = new
 325                  HTMLPurifier_Token_Text(
 326                      '<' .
 327                      $this->parseText(
 328                          substr($html, $cursor), $config
 329                      )
 330                  );
 331                  if ($maintain_line_numbers) {
 332                      $token->rawPosition($current_line, $current_col);
 333                  }
 334                  // no cursor scroll? Hmm...
 335                  $array[] = $token;
 336                  break;
 337              }
 338              break;
 339          }
 340  
 341          $context->destroy('CurrentLine');
 342          $context->destroy('CurrentCol');
 343          return $array;
 344      }
 345  
 346      /**
 347       * PHP 5.0.x compatible substr_count that implements offset and length
 348       * @param string $haystack
 349       * @param string $needle
 350       * @param int $offset
 351       * @param int $length
 352       * @return int
 353       */
 354      protected function substrCount($haystack, $needle, $offset, $length)
 355      {
 356          static $oldVersion;
 357          if ($oldVersion === null) {
 358              $oldVersion = version_compare(PHP_VERSION, '5.1', '<');
 359          }
 360          if ($oldVersion) {
 361              $haystack = substr($haystack, $offset, $length);
 362              return substr_count($haystack, $needle);
 363          } else {
 364              return substr_count($haystack, $needle, $offset, $length);
 365          }
 366      }
 367  
 368      /**
 369       * Takes the inside of an HTML tag and makes an assoc array of attributes.
 370       *
 371       * @param string $string Inside of tag excluding name.
 372       * @param HTMLPurifier_Config $config
 373       * @param HTMLPurifier_Context $context
 374       * @return array Assoc array of attributes.
 375       */
 376      public function parseAttributeString($string, $config, $context)
 377      {
 378          $string = (string)$string; // quick typecast
 379  
 380          if ($string == '') {
 381              return array();
 382          } // no attributes
 383  
 384          $e = false;
 385          if ($config->get('Core.CollectErrors')) {
 386              $e =& $context->get('ErrorCollector');
 387          }
 388  
 389          // let's see if we can abort as quickly as possible
 390          // one equal sign, no spaces => one attribute
 391          $num_equal = substr_count($string, '=');
 392          $has_space = strpos($string, ' ');
 393          if ($num_equal === 0 && !$has_space) {
 394              // bool attribute
 395              return array($string => $string);
 396          } elseif ($num_equal === 1 && !$has_space) {
 397              // only one attribute
 398              list($key, $quoted_value) = explode('=', $string);
 399              $quoted_value = trim($quoted_value);
 400              if (!$key) {
 401                  if ($e) {
 402                      $e->send(E_ERROR, 'Lexer: Missing attribute key');
 403                  }
 404                  return array();
 405              }
 406              if (!$quoted_value) {
 407                  return array($key => '');
 408              }
 409              $first_char = @$quoted_value[0];
 410              $last_char = @$quoted_value[strlen($quoted_value) - 1];
 411  
 412              $same_quote = ($first_char == $last_char);
 413              $open_quote = ($first_char == '"' || $first_char == "'");
 414  
 415              if ($same_quote && $open_quote) {
 416                  // well behaved
 417                  $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
 418              } else {
 419                  // not well behaved
 420                  if ($open_quote) {
 421                      if ($e) {
 422                          $e->send(E_ERROR, 'Lexer: Missing end quote');
 423                      }
 424                      $value = substr($quoted_value, 1);
 425                  } else {
 426                      $value = $quoted_value;
 427                  }
 428              }
 429              if ($value === false) {
 430                  $value = '';
 431              }
 432              return array($key => $this->parseAttr($value, $config));
 433          }
 434  
 435          // setup loop environment
 436          $array = array(); // return assoc array of attributes
 437          $cursor = 0; // current position in string (moves forward)
 438          $size = strlen($string); // size of the string (stays the same)
 439  
 440          // if we have unquoted attributes, the parser expects a terminating
 441          // space, so let's guarantee that there's always a terminating space.
 442          $string .= ' ';
 443  
 444          $old_cursor = -1;
 445          while ($cursor < $size) {
 446              if ($old_cursor >= $cursor) {
 447                  throw new Exception("Infinite loop detected");
 448              }
 449              $old_cursor = $cursor;
 450  
 451              $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
 452              // grab the key
 453  
 454              $key_begin = $cursor; //we're currently at the start of the key
 455  
 456              // scroll past all characters that are the key (not whitespace or =)
 457              $cursor += strcspn($string, $this->_whitespace . '=', $cursor);
 458  
 459              $key_end = $cursor; // now at the end of the key
 460  
 461              $key = substr($string, $key_begin, $key_end - $key_begin);
 462  
 463              if (!$key) {
 464                  if ($e) {
 465                      $e->send(E_ERROR, 'Lexer: Missing attribute key');
 466                  }
 467                  $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
 468                  continue; // empty key
 469              }
 470  
 471              // scroll past all whitespace
 472              $cursor += strspn($string, $this->_whitespace, $cursor);
 473  
 474              if ($cursor >= $size) {
 475                  $array[$key] = $key;
 476                  break;
 477              }
 478  
 479              // if the next character is an equal sign, we've got a regular
 480              // pair, otherwise, it's a bool attribute
 481              $first_char = @$string[$cursor];
 482  
 483              if ($first_char == '=') {
 484                  // key="value"
 485  
 486                  $cursor++;
 487                  $cursor += strspn($string, $this->_whitespace, $cursor);
 488  
 489                  if ($cursor === false) {
 490                      $array[$key] = '';
 491                      break;
 492                  }
 493  
 494                  // we might be in front of a quote right now
 495  
 496                  $char = @$string[$cursor];
 497  
 498                  if ($char == '"' || $char == "'") {
 499                      // it's quoted, end bound is $char
 500                      $cursor++;
 501                      $value_begin = $cursor;
 502                      $cursor = strpos($string, $char, $cursor);
 503                      $value_end = $cursor;
 504                  } else {
 505                      // it's not quoted, end bound is whitespace
 506                      $value_begin = $cursor;
 507                      $cursor += strcspn($string, $this->_whitespace, $cursor);
 508                      $value_end = $cursor;
 509                  }
 510  
 511                  // we reached a premature end
 512                  if ($cursor === false) {
 513                      $cursor = $size;
 514                      $value_end = $cursor;
 515                  }
 516  
 517                  $value = substr($string, $value_begin, $value_end - $value_begin);
 518                  if ($value === false) {
 519                      $value = '';
 520                  }
 521                  $array[$key] = $this->parseAttr($value, $config);
 522                  $cursor++;
 523              } else {
 524                  // boolattr
 525                  if ($key !== '') {
 526                      $array[$key] = $key;
 527                  } else {
 528                      // purely theoretical
 529                      if ($e) {
 530                          $e->send(E_ERROR, 'Lexer: Missing attribute key');
 531                      }
 532                  }
 533              }
 534          }
 535          return $array;
 536      }
 537  }
 538  
 539  // vim: et sw=4 sts=4