Search moodle.org's
Developer Documentation

See Release Notes
Long Term Support Release

  • Bug fixes for general core bugs in 3.9.x will end* 10 May 2021 (12 months).
  • Bug fixes for security issues in 3.9.x will end* 8 May 2023 (36 months).
  • PHP version: minimum PHP 7.2.0. Note: the minimum PHP version has increased since Moodle 3.8. PHP 7.3.x and 7.4.x are supported too.

Differences Between: [Versions 39 and 311] [Versions 39 and 400] [Versions 39 and 401] [Versions 39 and 402] [Versions 39 and 403]

   1  <?php
   2  
   3  /*
   4   * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
   5   *
   6   * This script is free software; you can redistribute it and/or modify
   7   * it under the terms of the GNU General Public License as published by
   8   * the Free Software Foundation; either version 2 of the License, or
   9   * (at your option) any later version.
  10   *
  11   * The GNU General Public License can be found at
  12   * http://www.gnu.org/copyleft/gpl.html.
  13   *
  14   * This script is distributed in the hope that it will be useful,
  15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17   * GNU General Public License for more details.
  18   */
  19  
  20  namespace Html2Text;
  21  
  22  class Html2Text
  23  {
  24      const ENCODING = 'UTF-8';
  25  
  26      protected $htmlFuncFlags;
  27  
  28      /**
  29       * Contains the HTML content to convert.
  30       *
  31       * @type string
  32       */
  33      protected $html;
  34  
  35      /**
  36       * Contains the converted, formatted text.
  37       *
  38       * @type string
  39       */
  40      protected $text;
  41  
  42      /**
  43       * List of preg* regular expression patterns to search for,
  44       * used in conjunction with $replace.
  45       *
  46       * @type array
  47       * @see $replace
  48       */
  49      protected $search = array(
  50          "/\r/",                                           // Non-legal carriage return
  51          "/[\n\t]+/",                                      // Newlines and tabs
  52          '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
  53          '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
  54          '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
  55          '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
  56          '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
  57          '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
  58          '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
  59          '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
  60          '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
  61          '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
  62          '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
  63          '/<li\b[^>]*>/i',                                 // <li>
  64          '/<hr\b[^>]*>/i',                                 // <hr>
  65          '/<div\b[^>]*>/i',                                // <div>
  66          '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
  67          '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
  68          '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
  69          '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
  70          '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
  71      );
  72  
  73      /**
  74       * List of pattern replacements corresponding to patterns searched.
  75       *
  76       * @type array
  77       * @see $search
  78       */
  79      protected $replace = array(
  80          '',                              // Non-legal carriage return
  81          ' ',                             // Newlines and tabs
  82          '',                              // <head>
  83          '',                              // <script>s -- which strip_tags supposedly has problems with
  84          '',                              // <style>s -- which strip_tags supposedly has problems with
  85          '_\\1_',                         // <i>
  86          '_\\1_',                         // <em>
  87          "\n\n",                          // <ul> and </ul>
  88          "\n\n",                          // <ol> and </ol>
  89          "\n\n",                          // <dl> and </dl>
  90          "\t* \\1\n",                     // <li> and </li>
  91          " \\1\n",                        // <dd> and </dd>
  92          "\t* \\1",                       // <dt> and </dt>
  93          "\n\t* ",                        // <li>
  94          "\n-------------------------\n", // <hr>
  95          "<div>\n",                       // <div>
  96          "\n\n",                          // <table> and </table>
  97          "\n",                            // <tr> and </tr>
  98          "\t\t\\1\n",                     // <td> and </td>
  99          "",                              // <span class="_html2text_ignore">...</span>
 100          '[\\2]',                         // <img> with alt tag
 101      );
 102  
 103      /**
 104       * List of preg* regular expression patterns to search for,
 105       * used in conjunction with $entReplace.
 106       *
 107       * @type array
 108       * @see $entReplace
 109       */
 110      protected $entSearch = array(
 111          '/&#153;/i',                                     // TM symbol in win-1252
 112          '/&#151;/i',                                     // m-dash in win-1252
 113          '/&(amp|#38);/i',                                // Ampersand: see converter()
 114          '/[ ]{2,}/',                                     // Runs of spaces, post-handling
 115          '/&#39;/i',                                      // The apostrophe symbol
 116      );
 117  
 118      /**
 119       * List of pattern replacements corresponding to patterns searched.
 120       *
 121       * @type array
 122       * @see $entSearch
 123       */
 124      protected $entReplace = array(
 125          '™',         // TM symbol
 126          '—',         // m-dash
 127          '|+|amp|+|', // Ampersand: see converter()
 128          ' ',         // Runs of spaces, post-handling
 129          '\'',        // Apostrophe
 130      );
 131  
 132      /**
 133       * List of preg* regular expression patterns to search for
 134       * and replace using callback function.
 135       *
 136       * @type array
 137       */
 138      protected $callbackSearch = array(
 139          '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
 140          '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
 141          '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
 142          '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
 143          '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
 144          '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
 145          '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
 146      );
 147  
 148      /**
 149       * List of preg* regular expression patterns to search for in PRE body,
 150       * used in conjunction with $preReplace.
 151       *
 152       * @type array
 153       * @see $preReplace
 154       */
 155      protected $preSearch = array(
 156          "/\n/",
 157          "/\t/",
 158          '/ /',
 159          '/<pre[^>]*>/',
 160          '/<\/pre>/'
 161      );
 162  
 163      /**
 164       * List of pattern replacements corresponding to patterns searched for PRE body.
 165       *
 166       * @type array
 167       * @see $preSearch
 168       */
 169      protected $preReplace = array(
 170          '<br>',
 171          '&nbsp;&nbsp;&nbsp;&nbsp;',
 172          '&nbsp;',
 173          '',
 174          '',
 175      );
 176  
 177      /**
 178       * Temporary workspace used during PRE processing.
 179       *
 180       * @type string
 181       */
 182      protected $preContent = '';
 183  
 184      /**
 185       * Contains the base URL that relative links should resolve to.
 186       *
 187       * @type string
 188       */
 189      protected $baseurl = '';
 190  
 191      /**
 192       * Indicates whether content in the $html variable has been converted yet.
 193       *
 194       * @type boolean
 195       * @see $html, $text
 196       */
 197      protected $converted = false;
 198  
 199      /**
 200       * Contains URL addresses from links to be rendered in plain text.
 201       *
 202       * @type array
 203       * @see buildlinkList()
 204       */
 205      protected $linkList = array();
 206  
 207      /**
 208       * Various configuration options (able to be set in the constructor)
 209       *
 210       * @type array
 211       */
 212      protected $options = array(
 213          'do_links' => 'inline', // 'none'
 214                                  // 'inline' (show links inline)
 215                                  // 'nextline' (show links on the next line)
 216                                  // 'table' (if a table of link URLs should be listed after the text.
 217                                  // 'bbcode' (show links as bbcode)
 218  
 219          'width' => 70,          //  Maximum width of the formatted text, in columns.
 220                                  //  Set this value to 0 (or less) to ignore word wrapping
 221                                  //  and not constrain text to a fixed-width column.
 222      );
 223  
 224      private function legacyConstruct($html = '', $fromFile = false, array $options = array())
 225      {
 226          $this->set_html($html, $fromFile);
 227          $this->options = array_merge($this->options, $options);
 228      }
 229  
 230      /**
 231       * @param string $html    Source HTML
 232       * @param array  $options Set configuration options
 233       */
 234      public function __construct($html = '', $options = array())
 235      {
 236          // for backwards compatibility
 237          if (!is_array($options)) {
 238              return call_user_func_array(array($this, 'legacyConstruct'), func_get_args());
 239          }
 240  
 241          $this->html = $html;
 242          $this->options = array_merge($this->options, $options);
 243          $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
 244              ? ENT_COMPAT
 245              : ENT_COMPAT | ENT_HTML5;
 246      }
 247  
 248      /**
 249      * Get the source HTML
 250      *
 251      * @return string
 252      */
 253      public function getHtml()
 254      {
 255          return $this->html;
 256      }
 257  
 258      /**
 259       * Set the source HTML
 260       *
 261       * @param string $html HTML source content
 262       */
 263      public function setHtml($html)
 264      {
 265          $this->html = $html;
 266          $this->converted = false;
 267      }
 268  
 269      /**
 270       * @deprecated
 271       */
 272      public function set_html($html, $from_file = false)
 273      {
 274          if ($from_file) {
 275              throw new \InvalidArgumentException("Argument from_file no longer supported");
 276          }
 277  
 278          return $this->setHtml($html);
 279      }
 280  
 281      /**
 282       * Returns the text, converted from HTML.
 283       *
 284       * @return string
 285       */
 286      public function getText()
 287      {
 288          if (!$this->converted) {
 289              $this->convert();
 290          }
 291  
 292          return $this->text;
 293      }
 294  
 295      /**
 296       * @deprecated
 297       */
 298      public function get_text()
 299      {
 300          return $this->getText();
 301      }
 302  
 303      /**
 304       * @deprecated
 305       */
 306      public function print_text()
 307      {
 308          print $this->getText();
 309      }
 310  
 311      /**
 312       * @deprecated
 313       */
 314      public function p()
 315      {
 316          return $this->print_text();
 317      }
 318  
 319      /**
 320       * Sets a base URL to handle relative links.
 321       *
 322       * @param string $baseurl
 323       */
 324      public function setBaseUrl($baseurl)
 325      {
 326          $this->baseurl = $baseurl;
 327      }
 328  
 329      /**
 330       * @deprecated
 331       */
 332      public function set_base_url($baseurl)
 333      {
 334          return $this->setBaseUrl($baseurl);
 335      }
 336  
 337      protected function convert()
 338      {
 339         $origEncoding = mb_internal_encoding();
 340         mb_internal_encoding(self::ENCODING);
 341  
 342         $this->doConvert();
 343  
 344         mb_internal_encoding($origEncoding);
 345      }
 346  
 347      protected function doConvert()
 348      {
 349          $this->linkList = array();
 350  
 351          $text = trim($this->html);
 352  
 353          $this->converter($text);
 354  
 355          if ($this->linkList) {
 356              $text .= "\n\nLinks:\n------\n";
 357              foreach ($this->linkList as $i => $url) {
 358                  $text .= '[' . ($i + 1) . '] ' . $url . "\n";
 359              }
 360          }
 361  
 362          $this->text = $text;
 363  
 364          $this->converted = true;
 365      }
 366  
 367      protected function converter(&$text)
 368      {
 369          $this->convertBlockquotes($text);
 370          $this->convertPre($text);
 371          $text = preg_replace($this->search, $this->replace, $text);
 372          $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
 373          $text = strip_tags($text);
 374          $text = preg_replace($this->entSearch, $this->entReplace, $text);
 375          $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);
 376  
 377          // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
 378          $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
 379  
 380          // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
 381          // This properly handles situation of "&amp;quot;" in input string
 382          $text = str_replace('|+|amp|+|', '&', $text);
 383  
 384          // Normalise empty lines
 385          $text = preg_replace("/\n\s+\n/", "\n\n", $text);
 386          $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
 387  
 388          // remove leading empty lines (can be produced by eg. P tag on the beginning)
 389          $text = ltrim($text, "\n");
 390  
 391          if ($this->options['width'] > 0) {
 392              $text = wordwrap($text, $this->options['width']);
 393          }
 394      }
 395  
 396      /**
 397       * Helper function called by preg_replace() on link replacement.
 398       *
 399       * Maintains an internal list of links to be displayed at the end of the
 400       * text, with numeric indices to the original point in the text they
 401       * appeared. Also makes an effort at identifying and handling absolute
 402       * and relative links.
 403       *
 404       * @param  string $link          URL of the link
 405       * @param  string $display       Part of the text to associate number with
 406       * @param  null   $linkOverride
 407       * @return string
 408       */
 409      protected function buildlinkList($link, $display, $linkOverride = null)
 410      {
 411          $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
 412          if ($linkMethod == 'none') {
 413              return $display;
 414          }
 415  
 416          // Ignored link types
 417          if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
 418              return $display;
 419          }
 420  
 421          if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
 422              $url = $link;
 423          } else {
 424              $url = $this->baseurl;
 425              if (mb_substr($link, 0, 1) != '/') {
 426                  $url .= '/';
 427              }
 428              $url .= $link;
 429          }
 430  
 431          if ($linkMethod == 'table') {
 432              if (($index = array_search($url, $this->linkList)) === false) {
 433                  $index = count($this->linkList);
 434                  $this->linkList[] = $url;
 435              }
 436  
 437              return $display . ' [' . ($index + 1) . ']';
 438          } elseif ($linkMethod == 'nextline') {
 439              if ($url === $display) {
 440                  return $display;
 441              }
 442              return $display . "\n[" . $url . ']';
 443          } elseif ($linkMethod == 'bbcode') {
 444              return sprintf('[url=%s]%s[/url]', $url, $display);
 445          } else { // link_method defaults to inline
 446              if ($url === $display) {
 447                  return $display;
 448              }
 449              return $display . ' [' . $url . ']';
 450          }
 451      }
 452  
 453      protected function convertPre(&$text)
 454      {
 455          // get the content of PRE element
 456          while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
 457              // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
 458              $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);
 459  
 460              // Run our defined tags search-and-replace with callback
 461              $this->preContent = preg_replace_callback(
 462                  $this->callbackSearch,
 463                  array($this, 'pregCallback'),
 464                  $this->preContent
 465              );
 466  
 467              // convert the content
 468              $this->preContent = sprintf(
 469                  '<div><br>%s<br></div>',
 470                  preg_replace($this->preSearch, $this->preReplace, $this->preContent)
 471              );
 472  
 473              // replace the content (use callback because content can contain $0 variable)
 474              $text = preg_replace_callback(
 475                  '/<pre[^>]*>.*<\/pre>/ismU',
 476                  array($this, 'pregPreCallback'),
 477                  $text,
 478                  1
 479              );
 480  
 481              // free memory
 482              $this->preContent = '';
 483          }
 484      }
 485  
 486      /**
 487       * Helper function for BLOCKQUOTE body conversion.
 488       *
 489       * @param string $text HTML content
 490       */
 491      protected function convertBlockquotes(&$text)
 492      {
 493          if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
 494              $originalText = $text;
 495              $start = 0;
 496              $taglen = 0;
 497              $level = 0;
 498              $diff = 0;
 499              foreach ($matches[0] as $m) {
 500                  $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
 501                  if ($m[0][0] == '<' && $m[0][1] == '/') {
 502                      $level--;
 503                      if ($level < 0) {
 504                          $level = 0; // malformed HTML: go to next blockquote
 505                      } elseif ($level > 0) {
 506                          // skip inner blockquote
 507                      } else {
 508                          $end = $m[1];
 509                          $len = $end - $taglen - $start;
 510                          // Get blockquote content
 511                          $body = mb_substr($text, $start + $taglen - $diff, $len);
 512  
 513                          // Set text width
 514                          $pWidth = $this->options['width'];
 515                          if ($this->options['width'] > 0) $this->options['width'] -= 2;
 516                          // Convert blockquote content
 517                          $body = trim($body);
 518                          $this->converter($body);
 519                          // Add citation markers and create PRE block
 520                          $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
 521                          $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
 522                          // Re-set text width
 523                          $this->options['width'] = $pWidth;
 524                          // Replace content
 525                          $text = mb_substr($text, 0, $start - $diff)
 526                              . $body
 527                              . mb_substr($text, $end + mb_strlen($m[0]) - $diff);
 528  
 529                          $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
 530                          unset($body);
 531                      }
 532                  } else {
 533                      if ($level == 0) {
 534                          $start = $m[1];
 535                          $taglen = mb_strlen($m[0]);
 536                      }
 537                      $level++;
 538                  }
 539              }
 540          }
 541      }
 542  
 543      /**
 544       * Callback function for preg_replace_callback use.
 545       *
 546       * @param  array  $matches PREG matches
 547       * @return string
 548       */
 549      protected function pregCallback($matches)
 550      {
 551          switch (mb_strtolower($matches[1])) {
 552              case 'p':
 553                  // Replace newlines with spaces.
 554                  $para = str_replace("\n", " ", $matches[3]);
 555  
 556                  // Trim trailing and leading whitespace within the tag.
 557                  $para = trim($para);
 558  
 559                  // Add trailing newlines for this para.
 560                  return "\n" . $para . "\n";
 561              case 'br':
 562                  return "\n";
 563              case 'b':
 564              case 'strong':
 565                  return $this->toupper($matches[3]);
 566              case 'th':
 567                  return $this->toupper("\t\t" . $matches[3] . "\n");
 568              case 'h':
 569                  return $this->toupper("\n\n" . $matches[3] . "\n\n");
 570              case 'a':
 571                  // override the link method
 572                  $linkOverride = null;
 573                  if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
 574                      $linkOverride = $linkOverrideMatch[1];
 575                  }
 576                  // Remove spaces in URL (#1487805)
 577                  $url = str_replace(' ', '', $matches[3]);
 578  
 579                  return $this->buildlinkList($url, $matches[5], $linkOverride);
 580          }
 581  
 582          return '';
 583      }
 584  
 585      /**
 586       * Callback function for preg_replace_callback use in PRE content handler.
 587       *
 588       * @param  array  $matches PREG matches
 589       * @return string
 590       */
 591      protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
 592      {
 593          return $this->preContent;
 594      }
 595  
 596      /**
 597       * Strtoupper function with HTML tags and entities handling.
 598       *
 599       * @param  string $str Text to convert
 600       * @return string Converted text
 601       */
 602      protected function toupper($str)
 603      {
 604          // string can contain HTML tags
 605          $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
 606  
 607          // convert toupper only the text between HTML tags
 608          foreach ($chunks as $i => $chunk) {
 609              if ($chunk[0] != '<') {
 610                  $chunks[$i] = $this->strtoupper($chunk);
 611              }
 612          }
 613  
 614          return implode($chunks);
 615      }
 616  
 617      /**
 618       * Strtoupper multibyte wrapper function with HTML entities handling.
 619       *
 620       * @param  string $str Text to convert
 621       * @return string Converted text
 622       */
 623      protected function strtoupper($str)
 624      {
 625          $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
 626          $str = mb_strtoupper($str);
 627          $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);
 628  
 629          return $str;
 630      }
 631  }