Moodle 4.1 XRef and Diffs

Search moodle.org's
Developer Documentation
See Release Notes
Long Term Support Release
Bug fixes for general core bugs in 4.1.x will end 13 November 2023 (12 months).
Bug fixes for security issues in 4.1.x will end 10 November 2025 (36 months).
PHP version: minimum PHP 7.4.0. Note: the minimum PHP version has increased since Moodle 4.0. PHP 8.0.x is also supported.
Moodle 4.1 Database Schema (by Marcus Green)
/lib/html2text/ -> Html2Text.php (source)
Differences Between: [Versions 310 and 401] [Versions 39 and 401]
   1  <?php
   2  
   3  /*
   4   * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
   5   *
   6   * This script is free software; you can redistribute it and/or modify
   7   * it under the terms of the GNU General Public License as published by
   8   * the Free Software Foundation; either version 2 of the License, or
   9   * (at your option) any later version.
  10   *
  11   * The GNU General Public License can be found at
  12   * http://www.gnu.org/copyleft/gpl.html.
  13   *
  14   * This script is distributed in the hope that it will be useful,
  15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17   * GNU General Public License for more details.
  18   */
  19  
  20  namespace Html2Text;
  21  
  22  class Html2Text
  23  {
  24      const ENCODING = 'UTF-8';
  25  
  26      protected $htmlFuncFlags;
  27  
  28      /**
  29       * Contains the HTML content to convert.
  30       *
  31       * @var string $html
  32       */
  33      protected $html;
  34  
  35      /**
  36       * Contains the converted, formatted text.
  37       *
  38       * @var string $text
  39       */
  40      protected $text;
  41  
  42      /**
  43       * List of preg* regular expression patterns to search for,
  44       * used in conjunction with $replace.
  45       *
  46       * @var array $search
  47       * @see $replace
  48       */
  49      protected $search = array(
  50          "/\r/",                                           // Non-legal carriage return
  51          "/[\n\t]+/",                                      // Newlines and tabs
  52          '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
  53          '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
  54          '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
  55          '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
  56          '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
  57          '/<ins\b[^>]*>(.*?)<\/ins>/i',                    // <ins>
  58          '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
  59          '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
  60          '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
  61          '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
  62          '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
  63          '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
  64          '/<li\b[^>]*>/i',                                 // <li>
  65          '/<hr\b[^>]*>/i',                                 // <hr>
  66          '/<div\b[^>]*>/i',                                // <div>
  67          '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
  68          '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
  69          '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
  70          '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
  71          '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
  72      );
  73  
  74      /**
  75       * List of pattern replacements corresponding to patterns searched.
  76       *
  77       * @var array $replace
  78       * @see $search
  79       */
  80      protected $replace = array(
  81          '',                              // Non-legal carriage return
  82          ' ',                             // Newlines and tabs
  83          '',                              // <head>
  84          '',                              // <script>s -- which strip_tags supposedly has problems with
  85          '',                              // <style>s -- which strip_tags supposedly has problems with
  86          '_\\1_',                         // <i>
  87          '_\\1_',                         // <em>
  88          '_\\1_',                         // <ins>
  89          "\n\n",                          // <ul> and </ul>
  90          "\n\n",                          // <ol> and </ol>
  91          "\n\n",                          // <dl> and </dl>
  92          "\t* \\1\n",                     // <li> and </li>
  93          " \\1\n",                        // <dd> and </dd>
  94          "\t* \\1",                       // <dt> and </dt>
  95          "\n\t* ",                        // <li>
  96          "\n-------------------------\n", // <hr>
  97          "<div>\n",                       // <div>
  98          "\n\n",                          // <table> and </table>
  99          "\n",                            // <tr> and </tr>
 100          "\t\t\\1\n",                     // <td> and </td>
 101          "",                              // <span class="_html2text_ignore">...</span>
 102          '[\\2]',                         // <img> with alt tag
 103      );
 104  
 105      /**
 106       * List of preg* regular expression patterns to search for,
 107       * used in conjunction with $entReplace.
 108       *
 109       * @var array $entSearch
 110       * @see $entReplace
 111       */
 112      protected $entSearch = array(
 113          '/&#153;/i',                                     // TM symbol in win-1252
 114          '/&#151;/i',                                     // m-dash in win-1252
 115          '/&(amp|#38);/i',                                // Ampersand: see converter()
 116          '/[ ]{2,}/',                                     // Runs of spaces, post-handling
 117          '/&#39;/i',                                      // The apostrophe symbol
 118      );
 119  
 120      /**
 121       * List of pattern replacements corresponding to patterns searched.
 122       *
 123       * @var array $entReplace
 124       * @see $entSearch
 125       */
 126      protected $entReplace = array(
 127          '™',         // TM symbol
 128          '—',         // m-dash
 129          '|+|amp|+|', // Ampersand: see converter()
 130          ' ',         // Runs of spaces, post-handling
 131          '\'',        // Apostrophe
 132      );
 133  
 134      /**
 135       * List of preg* regular expression patterns to search for
 136       * and replace using callback function.
 137       *
 138       * @var array $callbackSearch
 139       */
 140      protected $callbackSearch = array(
 141          '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
 142          '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
 143          '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
 144          '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
 145          '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
 146          '/<(del)( [^>]*)?>(.*?)<\/del>/i',                       // <del>
 147          '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
 148          '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
 149      );
 150  
 151      /**
 152       * List of preg* regular expression patterns to search for in PRE body,
 153       * used in conjunction with $preReplace.
 154       *
 155       * @var array $preSearch
 156       * @see $preReplace
 157       */
 158      protected $preSearch = array(
 159          "/\n/",
 160          "/\t/",
 161          '/ /',
 162          '/<pre[^>]*>/',
 163          '/<\/pre>/'
 164      );
 165  
 166      /**
 167       * List of pattern replacements corresponding to patterns searched for PRE body.
 168       *
 169       * @var array $preReplace
 170       * @see $preSearch
 171       */
 172      protected $preReplace = array(
 173          '<br>',
 174          '&nbsp;&nbsp;&nbsp;&nbsp;',
 175          '&nbsp;',
 176          '',
 177          '',
 178      );
 179  
 180      /**
 181       * Temporary workspace used during PRE processing.
 182       *
 183       * @var string $preContent
 184       */
 185      protected $preContent = '';
 186  
 187      /**
 188       * Contains the base URL that relative links should resolve to.
 189       *
 190       * @var string $baseurl
 191       */
 192      protected $baseurl = '';
 193  
 194      /**
 195       * Indicates whether content in the $html variable has been converted yet.
 196       *
 197       * @var boolean $converted
 198       * @see $html, $text
 199       */
 200      protected $converted = false;
 201  
 202      /**
 203       * Contains URL addresses from links to be rendered in plain text.
 204       *
 205       * @var array $linkList
 206       * @see buildlinkList()
 207       */
 208      protected $linkList = array();
 209  
 210      /**
 211       * Various configuration options (able to be set in the constructor)
 212       *
 213       * @var array $options
 214       */
 215      protected $options = array(
 216          'do_links' => 'inline', // 'none'
 217                                  // 'inline' (show links inline)
 218                                  // 'nextline' (show links on the next line)
 219                                  // 'table' (if a table of link URLs should be listed after the text.
 220                                  // 'bbcode' (show links as bbcode)
 221  
 222          'width' => 70,          //  Maximum width of the formatted text, in columns.
 223                                  //  Set this value to 0 (or less) to ignore word wrapping
 224                                  //  and not constrain text to a fixed-width column.
 225      );
 226  
 227      private function legacyConstruct($html = '', $fromFile = false, array $options = array())
 228      {
 229          $this->set_html($html, $fromFile);
 230          $this->options = array_merge($this->options, $options);
 231      }
 232  
 233      /**
 234       * @param string $html    Source HTML
 235       * @param array  $options Set configuration options
 236       */
 237      public function __construct($html = '', $options = array())
 238      {
 239          // for backwards compatibility
 240          if (!is_array($options)) {
 241              return call_user_func_array(array($this, 'legacyConstruct'), func_get_args());
 242          }
 243  
 244          $this->html = $html;
 245          $this->options = array_merge($this->options, $options);
 246          $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
 247              ? ENT_COMPAT
 248              : ENT_COMPAT | ENT_HTML5;
 249      }
 250  
 251      /**
 252      * Get the source HTML
 253      *
 254      * @return string
 255      */
 256      public function getHtml()
 257      {
 258          return $this->html;
 259      }
 260  
 261      /**
 262       * Set the source HTML
 263       *
 264       * @param string $html HTML source content
 265       */
 266      public function setHtml($html)
 267      {
 268          $this->html = $html;
 269          $this->converted = false;
 270      }
 271  
 272      /**
 273       * @deprecated
 274       */
 275      public function set_html($html, $from_file = false)
 276      {
 277          if ($from_file) {
 278              throw new \InvalidArgumentException("Argument from_file no longer supported");
 279          }
 280  
 281          return $this->setHtml($html);
 282      }
 283  
 284      /**
 285       * Returns the text, converted from HTML.
 286       *
 287       * @return string Plain text
 288       */
 289      public function getText()
 290      {
 291          if (!$this->converted) {
 292              $this->convert();
 293          }
 294  
 295          return $this->text;
 296      }
 297  
 298      /**
 299       * @deprecated
 300       */
 301      public function get_text()
 302      {
 303          return $this->getText();
 304      }
 305  
 306      /**
 307       * @deprecated
 308       */
 309      public function print_text()
 310      {
 311          print $this->getText();
 312      }
 313  
 314      /**
 315       * @deprecated
 316       */
 317      public function p()
 318      {
 319          return $this->print_text();
 320      }
 321  
 322      /**
 323       * Sets a base URL to handle relative links.
 324       *
 325       * @param string $baseurl
 326       */
 327      public function setBaseUrl($baseurl)
 328      {
 329          $this->baseurl = $baseurl;
 330      }
 331  
 332      /**
 333       * @deprecated
 334       */
 335      public function set_base_url($baseurl)
 336      {
 337          return $this->setBaseUrl($baseurl);
 338      }
 339  
 340      protected function convert()
 341      {
 342         $origEncoding = mb_internal_encoding();
 343         mb_internal_encoding(self::ENCODING);
 344  
 345         $this->doConvert();
 346  
 347         mb_internal_encoding($origEncoding);
 348      }
 349  
 350      protected function doConvert()
 351      {
 352          $this->linkList = array();
 353  
 354          $text = trim($this->html);
 355  
 356          $this->converter($text);
 357  
 358          if ($this->linkList) {
 359              $text .= "\n\nLinks:\n------\n";
 360              foreach ($this->linkList as $i => $url) {
 361                  $text .= '[' . ($i + 1) . '] ' . $url . "\n";
 362              }
 363          }
 364  
 365          $this->text = $text;
 366  
 367          $this->converted = true;
 368      }
 369  
 370      protected function converter(&$text)
 371      {
 372          $this->convertBlockquotes($text);
 373          $this->convertPre($text);
 374          $text = preg_replace($this->search, $this->replace, $text);
 375          $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
 376          $text = strip_tags($text);
 377          $text = preg_replace($this->entSearch, $this->entReplace, $text);
 378          $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);
 379  
 380          // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
 381          $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
 382  
 383          // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
 384          // This properly handles situation of "&amp;quot;" in input string
 385          $text = str_replace('|+|amp|+|', '&', $text);
 386  
 387          // Normalise empty lines
 388          $text = preg_replace("/\n\s+\n/", "\n\n", $text);
 389          $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
 390  
 391          // remove leading empty lines (can be produced by eg. P tag on the beginning)
 392          $text = ltrim($text, "\n");
 393  
 394          if ($this->options['width'] > 0) {
 395              $text = wordwrap($text, $this->options['width']);
 396          }
 397      }
 398  
 399      /**
 400       * Helper function called by preg_replace() on link replacement.
 401       *
 402       * Maintains an internal list of links to be displayed at the end of the
 403       * text, with numeric indices to the original point in the text they
 404       * appeared. Also makes an effort at identifying and handling absolute
 405       * and relative links.
 406       *
 407       * @param  string $link          URL of the link
 408       * @param  string $display       Part of the text to associate number with
 409       * @param  null   $linkOverride
 410       * @return string
 411       */
 412      protected function buildlinkList($link, $display, $linkOverride = null)
 413      {
 414          $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
 415          if ($linkMethod == 'none') {
 416              return $display;
 417          }
 418  
 419          // Ignored link types
 420          if (preg_match('!^(javascript:|mailto:|#)!i', html_entity_decode($link))) {
 421              return $display;
 422          }
 423  
 424          if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
 425              $url = $link;
 426          } else {
 427              $url = $this->baseurl;
 428              if (mb_substr($link, 0, 1) != '/') {
 429                  $url .= '/';
 430              }
 431              $url .= $link;
 432          }
 433  
 434          if ($linkMethod == 'table') {
 435              if (($index = array_search($url, $this->linkList)) === false) {
 436                  $index = count($this->linkList);
 437                  $this->linkList[] = $url;
 438              }
 439  
 440              return $display . ' [' . ($index + 1) . ']';
 441          } elseif ($linkMethod == 'nextline') {
 442              if ($url === $display) {
 443                  return $display;
 444              }
 445              return $display . "\n[" . $url . ']';
 446          } elseif ($linkMethod == 'bbcode') {
 447              return sprintf('[url=%s]%s[/url]', $url, $display);
 448          } else { // link_method defaults to inline
 449              if ($url === $display) {
 450                  return $display;
 451              }
 452              return $display . ' [' . $url . ']';
 453          }
 454      }
 455  
 456      /**
 457       * Helper function for PRE body conversion.
 458       *
 459       * @param string &$text HTML content
 460       */
 461      protected function convertPre(&$text)
 462      {
 463          // get the content of PRE element
 464          while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
 465              // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
 466              $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);
 467  
 468              // Run our defined tags search-and-replace with callback
 469              $this->preContent = preg_replace_callback(
 470                  $this->callbackSearch,
 471                  array($this, 'pregCallback'),
 472                  $this->preContent
 473              );
 474  
 475              // convert the content
 476              $this->preContent = sprintf(
 477                  '<div><br>%s<br></div>',
 478                  preg_replace($this->preSearch, $this->preReplace, $this->preContent)
 479              );
 480  
 481              // replace the content (use callback because content can contain $0 variable)
 482              $text = preg_replace_callback(
 483                  '/<pre[^>]*>.*<\/pre>/ismU',
 484                  array($this, 'pregPreCallback'),
 485                  $text,
 486                  1
 487              );
 488  
 489              // free memory
 490              $this->preContent = '';
 491          }
 492      }
 493  
 494      /**
 495       * Helper function for BLOCKQUOTE body conversion.
 496       *
 497       * @param string &$text HTML content
 498       */
 499      protected function convertBlockquotes(&$text)
 500      {
 501          if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
 502              $originalText = $text;
 503              $start = 0;
 504              $taglen = 0;
 505              $level = 0;
 506              $diff = 0;
 507              foreach ($matches[0] as $m) {
 508                  $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
 509                  if ($m[0][0] == '<' && $m[0][1] == '/') {
 510                      $level--;
 511                      if ($level < 0) {
 512                          $level = 0; // malformed HTML: go to next blockquote
 513                      } elseif ($level > 0) {
 514                          // skip inner blockquote
 515                      } else {
 516                          $end = $m[1];
 517                          $len = $end - $taglen - $start;
 518                          // Get blockquote content
 519                          $body = mb_substr($text, $start + $taglen - $diff, $len);
 520  
 521                          // Set text width
 522                          $pWidth = $this->options['width'];
 523                          if ($this->options['width'] > 0) $this->options['width'] -= 2;
 524                          // Convert blockquote content
 525                          $body = trim($body);
 526                          $this->converter($body);
 527                          // Add citation markers and create PRE block
 528                          $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
 529                          $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
 530                          // Re-set text width
 531                          $this->options['width'] = $pWidth;
 532                          // Replace content
 533                          $text = mb_substr($text, 0, $start - $diff)
 534                              . $body
 535                              . mb_substr($text, $end + mb_strlen($m[0]) - $diff);
 536  
 537                          $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
 538                          unset($body);
 539                      }
 540                  } else {
 541                      if ($level == 0) {
 542                          $start = $m[1];
 543                          $taglen = mb_strlen($m[0]);
 544                      }
 545                      $level++;
 546                  }
 547              }
 548          }
 549      }
 550  
 551      /**
 552       * Callback function for preg_replace_callback use.
 553       *
 554       * @param  array  $matches PREG matches
 555       * @return string
 556       */
 557      protected function pregCallback($matches)
 558      {
 559          switch (mb_strtolower($matches[1])) {
 560              case 'p':
 561                  // Replace newlines with spaces.
 562                  $para = str_replace("\n", " ", $matches[3]);
 563  
 564                  // Trim trailing and leading whitespace within the tag.
 565                  $para = trim($para);
 566  
 567                  // Add trailing newlines for this para.
 568                  return "\n" . $para . "\n";
 569              case 'br':
 570                  return "\n";
 571              case 'b':
 572              case 'strong':
 573                  return $this->toupper($matches[3]);
 574              case 'del':
 575                  return $this->tostrike($matches[3]);
 576              case 'th':
 577                  return $this->toupper("\t\t" . $matches[3] . "\n");
 578              case 'h':
 579                  return $this->toupper("\n\n" . $matches[3] . "\n\n");
 580              case 'a':
 581                  // override the link method
 582                  $linkOverride = null;
 583                  if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
 584                      $linkOverride = $linkOverrideMatch[1];
 585                  }
 586                  // Remove spaces in URL (#1487805)
 587                  $url = str_replace(' ', '', $matches[3]);
 588  
 589                  return $this->buildlinkList($url, $matches[5], $linkOverride);
 590          }
 591  
 592          return '';
 593      }
 594  
 595      /**
 596       * Callback function for preg_replace_callback use in PRE content handler.
 597       *
 598       * @param  array  $matches PREG matches
 599       * @return string
 600       */
 601      protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
 602      {
 603          return $this->preContent;
 604      }
 605  
 606      /**
 607       * Strtoupper function with HTML tags and entities handling.
 608       *
 609       * @param  string $str Text to convert
 610       * @return string Converted text
 611       */
 612      protected function toupper($str)
 613      {
 614          // string can contain HTML tags
 615          $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
 616  
 617          // convert toupper only the text between HTML tags
 618          foreach ($chunks as $i => $chunk) {
 619              if ($chunk[0] != '<') {
 620                  $chunks[$i] = $this->strtoupper($chunk);
 621              }
 622          }
 623  
 624          return implode($chunks);
 625      }
 626  
 627      /**
 628       * Strtoupper multibyte wrapper function with HTML entities handling.
 629       *
 630       * @param  string $str Text to convert
 631       * @return string Converted text
 632       */
 633      protected function strtoupper($str)
 634      {
 635          $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
 636          $str = mb_strtoupper($str);
 637          $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);
 638  
 639          return $str;
 640      }
 641  
 642      /**
 643       * Helper function for DEL conversion.
 644       *
 645       * @param  string $text HTML content
 646       * @return string Converted text
 647       */
 648      protected function tostrike($str)
 649      {
 650          $rtn = '';
 651          for ($i = 0; $i < mb_strlen($str); $i++) {
 652              $chr = mb_substr($str, $i, 1);
 653              $combiningChr = chr(0xC0 | 0x336 >> 6). chr(0x80 | 0x336 & 0x3F);
 654              $rtn .= $chr . $combiningChr;
 655          }
 656          return $rtn;
 657      }
 658  }