Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.2.x will end 22 April 2024 (12 months).
  • Bug fixes for security issues in 4.2.x will end 7 October 2024 (18 months).
  • PHP version: minimum PHP 8.0.0. Note: the minimum PHP version has increased since Moodle 4.1. PHP 8.1.x is supported too.

Differences Between: [Versions 310 and 402] [Versions 39 and 402]

   1  <?php
   2  
   3  /*
   4   * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
   5   *
   6   * This script is free software; you can redistribute it and/or modify
   7   * it under the terms of the GNU General Public License as published by
   8   * the Free Software Foundation; either version 2 of the License, or
   9   * (at your option) any later version.
  10   *
  11   * The GNU General Public License can be found at
  12   * http://www.gnu.org/copyleft/gpl.html.
  13   *
  14   * This script is distributed in the hope that it will be useful,
  15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17   * GNU General Public License for more details.
  18   */
  19  
  20  namespace Html2Text;
  21  
  /**
   * Converts an HTML fragment or document into formatted plain text.
   *
   * The conversion is regex driven: block-level tags are turned into line
   * breaks and simple ASCII decorations, inline emphasis is rendered with
   * underscores / upper-casing, links are rendered according to the
   * "do_links" option and the result is optionally word-wrapped to "width".
   *
   * Typical usage:
   *
   *     $converter = new Html2Text($html, array('width' => 0));
   *     $plain = $converter->getText();
   */
  class Html2Text
  {
      /**
       * Character encoding used for all multibyte and entity handling.
       */
      const ENCODING = 'UTF-8';

      /**
       * Flags passed to html_entity_decode() and htmlspecialchars().
       *
       * Assigned in the constructor for every construction path.
       *
       * @var int $htmlFuncFlags
       */
      protected $htmlFuncFlags;

      /**
       * Contains the HTML content to convert.
       *
       * @var string $html
       */
      protected $html;

      /**
       * Contains the converted, formatted text.
       *
       * @var string $text
       */
      protected $text;

      /**
       * List of preg* regular expression patterns to search for,
       * used in conjunction with $replace.
       *
       * The order matters: preg_replace() applies the patterns one after
       * another, and entries are matched positionally against $replace.
       *
       * @var array $search
       * @see $replace
       */
      protected $search = array(
          "/\r/",                                           // Non-legal carriage return
          "/[\n\t]+/",                                      // Newlines and tabs
          '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
          '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
          '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
          '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
          '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
          '/<ins\b[^>]*>(.*?)<\/ins>/i',                    // <ins>
          '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
          '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
          '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
          '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
          '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
          '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
          '/<li\b[^>]*>/i',                                 // <li>
          '/<hr\b[^>]*>/i',                                 // <hr>
          '/<div\b[^>]*>/i',                                // <div>
          '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
          '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
          '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
          '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
          '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
      );

      /**
       * List of pattern replacements corresponding to patterns searched.
       *
       * @var array $replace
       * @see $search
       */
      protected $replace = array(
          '',                              // Non-legal carriage return
          ' ',                             // Newlines and tabs
          '',                              // <head>
          '',                              // <script>s -- which strip_tags supposedly has problems with
          '',                              // <style>s -- which strip_tags supposedly has problems with
          '_\\1_',                         // <i>
          '_\\1_',                         // <em>
          '_\\1_',                         // <ins>
          "\n\n",                          // <ul> and </ul>
          "\n\n",                          // <ol> and </ol>
          "\n\n",                          // <dl> and </dl>
          "\t* \\1\n",                     // <li> and </li>
          " \\1\n",                        // <dd> and </dd>
          "\t* \\1",                       // <dt> and </dt>
          "\n\t* ",                        // <li>
          "\n-------------------------\n", // <hr>
          "<div>\n",                       // <div>
          "\n\n",                          // <table> and </table>
          "\n",                            // <tr> and </tr>
          "\t\t\\1\n",                     // <td> and </td>
          "",                              // <span class="_html2text_ignore">...</span>
          '[\\2]',                         // <img> with alt tag
      );

      /**
       * List of preg* regular expression patterns to search for,
       * used in conjunction with $entReplace.
       *
       * @var array $entSearch
       * @see $entReplace
       */
      protected $entSearch = array(
          '/&#153;/i',                                     // TM symbol in win-1252
          '/&#151;/i',                                     // m-dash in win-1252
          '/&(amp|#38);/i',                                // Ampersand: see converter()
          '/[ ]{2,}/',                                     // Runs of spaces, post-handling
          '/&#39;/i',                                      // The apostrophe symbol
      );

      /**
       * List of pattern replacements corresponding to patterns searched.
       *
       * @var array $entReplace
       * @see $entSearch
       */
      protected $entReplace = array(
          '™',         // TM symbol
          '—',         // m-dash
          '|+|amp|+|', // Ampersand: see converter()
          ' ',         // Runs of spaces, post-handling
          '\'',        // Apostrophe
      );

      /**
       * List of preg* regular expression patterns to search for
       * and replace using callback function.
       *
       * The first capture group of every pattern is the tag name that
       * pregCallback() dispatches on.
       *
       * @var array $callbackSearch
       * @see pregCallback()
       */
      protected $callbackSearch = array(
          '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
          '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
          '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
          '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
          '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
          '/<(del)( [^>]*)?>(.*?)<\/del>/i',                       // <del>
          '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
          '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
      );

      /**
       * List of preg* regular expression patterns to search for in PRE body,
       * used in conjunction with $preReplace.
       *
       * @var array $preSearch
       * @see $preReplace
       */
      protected $preSearch = array(
          "/\n/",
          "/\t/",
          '/ /',
          '/<pre[^>]*>/',
          '/<\/pre>/'
      );

      /**
       * List of pattern replacements corresponding to patterns searched for PRE body.
       *
       * @var array $preReplace
       * @see $preSearch
       */
      protected $preReplace = array(
          '<br>',
          '&nbsp;&nbsp;&nbsp;&nbsp;',
          '&nbsp;',
          '',
          '',
      );

      /**
       * Temporary workspace used during PRE processing.
       *
       * @var string $preContent
       */
      protected $preContent = '';

      /**
       * Contains the base URL that relative links should resolve to.
       *
       * @var string $baseurl
       */
      protected $baseurl = '';

      /**
       * Indicates whether content in the $html variable has been converted yet.
       *
       * @var boolean $converted
       * @see $html, $text
       */
      protected $converted = false;

      /**
       * Contains URL addresses from links to be rendered in plain text.
       *
       * Only populated when links are rendered with the 'table' method.
       *
       * @var array $linkList
       * @see buildlinkList()
       */
      protected $linkList = array();

      /**
       * Various configuration options (able to be set in the constructor)
       *
       * @var array $options
       */
      protected $options = array(
          'do_links' => 'inline', // 'none'
                                  // 'inline' (show links inline)
                                  // 'nextline' (show links on the next line)
                                  // 'table' (if a table of link URLs should be listed after the text.
                                  // 'bbcode' (show links as bbcode)

          'width' => 70,          //  Maximum width of the formatted text, in columns.
                                  //  Set this value to 0 (or less) to ignore word wrapping
                                  //  and not constrain text to a fixed-width column.
      );

      /**
       * Handles the historical constructor signature (html, fromFile, options).
       *
       * @param string $html     Source HTML
       * @param bool   $fromFile No longer supported; a truthy value makes set_html() throw
       * @param array  $options  Set configuration options
       */
      private function legacyConstruct($html = '', $fromFile = false, array $options = array())
      {
          $this->set_html($html, $fromFile);
          $this->options = array_merge($this->options, $options);
      }

      /**
       * @param string $html    Source HTML
       * @param array  $options Set configuration options
       */
      public function __construct($html = '', $options = array())
      {
          // Initialise the entity flags before anything else so that the
          // legacy construction path below does not leave them unset (which
          // would pass null as the flags argument of the entity functions).
          $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
              ? ENT_COMPAT
              : ENT_COMPAT | ENT_HTML5;

          // for backwards compatibility: a non-array second argument means the
          // caller is using the old (html, fromFile, options) signature.
          if (!is_array($options)) {
              call_user_func_array(array($this, 'legacyConstruct'), func_get_args());

              return;
          }

          $this->html = $html;
          $this->options = array_merge($this->options, $options);
      }

      /**
       * Get the source HTML
       *
       * @return string
       */
      public function getHtml()
      {
          return $this->html;
      }

      /**
       * Set the source HTML
       *
       * Marks the instance as not converted, so the next getText() call
       * runs the conversion again.
       *
       * @param string $html HTML source content
       */
      public function setHtml($html)
      {
          $this->html = $html;
          $this->converted = false;
      }

      /**
       * @deprecated Use setHtml() instead.
       *
       * @param string $html      HTML source content
       * @param bool   $from_file Must be falsy; loading from a file is no longer supported
       * @throws \InvalidArgumentException When $from_file is truthy
       */
      public function set_html($html, $from_file = false)
      {
          if ($from_file) {
              throw new \InvalidArgumentException("Argument from_file no longer supported");
          }

          return $this->setHtml($html);
      }

      /**
       * Returns the text, converted from HTML.
       *
       * The conversion is performed lazily on first access and cached until
       * the source HTML is replaced via setHtml().
       *
       * @return string Plain text
       */
      public function getText()
      {
          if (!$this->converted) {
              $this->convert();
          }

          return $this->text;
      }

      /**
       * @deprecated Use getText() instead.
       *
       * @return string Plain text
       */
      public function get_text()
      {
          return $this->getText();
      }

      /**
       * Prints the converted text.
       *
       * @deprecated Print the result of getText() instead.
       */
      public function print_text()
      {
          print $this->getText();
      }

      /**
       * Alias of print_text().
       *
       * @deprecated Print the result of getText() instead.
       */
      public function p()
      {
          return $this->print_text();
      }

      /**
       * Sets a base URL to handle relative links.
       *
       * @param string $baseurl
       */
      public function setBaseUrl($baseurl)
      {
          $this->baseurl = $baseurl;
      }

      /**
       * @deprecated Use setBaseUrl() instead.
       *
       * @param string $baseurl
       */
      public function set_base_url($baseurl)
      {
          return $this->setBaseUrl($baseurl);
      }

      /**
       * Runs the conversion with the mbstring internal encoding temporarily
       * switched to self::ENCODING.
       */
      protected function convert()
      {
          $origEncoding = mb_internal_encoding();
          mb_internal_encoding(self::ENCODING);

          try {
              $this->doConvert();
          } finally {
              // The internal encoding is process-wide state: always put it
              // back, even when the conversion throws.
              mb_internal_encoding($origEncoding);
          }
      }

      /**
       * Converts $this->html and stores the result in $this->text.
       *
       * When links were collected into $linkList, a "Links:" table is
       * appended after the converted text.
       */
      protected function doConvert()
      {
          $this->linkList = array();

          $text = trim($this->html);

          $this->converter($text);

          if ($this->linkList) {
              $text .= "\n\nLinks:\n------\n";
              foreach ($this->linkList as $i => $url) {
                  $text .= '[' . ($i + 1) . '] ' . $url . "\n";
              }
          }

          $this->text = $text;

          $this->converted = true;
      }

      /**
       * Converts an HTML string to plain text in place.
       *
       * Also called recursively by convertBlockquotes() for the body of
       * each top-level blockquote.
       *
       * @param string &$text HTML on input, plain text on output
       */
      protected function converter(&$text)
      {
          $this->convertBlockquotes($text);
          $this->convertPre($text);
          $text = preg_replace($this->search, $this->replace, $text);
          $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
          $text = strip_tags($text);
          $text = preg_replace($this->entSearch, $this->entReplace, $text);
          $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

          // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
          $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

          // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
          // This properly handles situation of "&amp;quot;" in input string
          $text = str_replace('|+|amp|+|', '&', $text);

          // Normalise empty lines
          $text = preg_replace("/\n\s+\n/", "\n\n", $text);
          $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

          // remove leading empty lines (can be produced by eg. P tag on the beginning)
          $text = ltrim($text, "\n");

          if ($this->options['width'] > 0) {
              $text = wordwrap($text, $this->options['width']);
          }
      }

      /**
       * Helper function called by pregCallback() on link replacement.
       *
       * Maintains an internal list of links to be displayed at the end of the
       * text, with numeric indices to the original point in the text they
       * appeared. Also makes an effort at identifying and handling absolute
       * and relative links.
       *
       * @param  string      $link         URL of the link
       * @param  string      $display      Part of the text to associate number with
       * @param  string|null $linkOverride Link method that overrides the 'do_links' option for this link
       * @return string
       */
      protected function buildlinkList($link, $display, $linkOverride = null)
      {
          $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
          if ($linkMethod == 'none') {
              return $display;
          }

          // Ignored link types
          if (preg_match('!^(javascript:|mailto:|#)!i', html_entity_decode($link))) {
              return $display;
          }

          if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
              // The link carries its own scheme: use it untouched.
              $url = $link;
          } else {
              // Relative link: resolve it against the configured base URL.
              $url = $this->baseurl;
              if (mb_substr($link, 0, 1) != '/') {
                  $url .= '/';
              }
              $url .= $link;
          }

          if ($linkMethod == 'table') {
              // Re-use the index of a URL that was already listed.
              if (($index = array_search($url, $this->linkList, true)) === false) {
                  $index = count($this->linkList);
                  $this->linkList[] = $url;
              }

              return $display . ' [' . ($index + 1) . ']';
          } elseif ($linkMethod == 'nextline') {
              if ($url === $display) {
                  return $display;
              }
              return $display . "\n[" . $url . ']';
          } elseif ($linkMethod == 'bbcode') {
              return sprintf('[url=%s]%s[/url]', $url, $display);
          } else { // link_method defaults to inline
              if ($url === $display) {
                  return $display;
              }
              return $display . ' [' . $url . ']';
          }
      }

      /**
       * Helper function for PRE body conversion.
       *
       * Replaces each <pre> element, one at a time, with a <div> whose
       * newlines, tabs and spaces have been turned into <br> / &nbsp;
       * markup so the later whitespace normalisation does not collapse them.
       *
       * @param string &$text HTML content
       */
      protected function convertPre(&$text)
      {
          // get the content of PRE element
          while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
              // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
              $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

              // Run our defined tags search-and-replace with callback
              $this->preContent = preg_replace_callback(
                  $this->callbackSearch,
                  array($this, 'pregCallback'),
                  $this->preContent
              );

              // convert the content
              $this->preContent = sprintf(
                  '<div><br>%s<br></div>',
                  preg_replace($this->preSearch, $this->preReplace, $this->preContent)
              );

              // replace the content (use callback because content can contain $0 variable)
              $text = preg_replace_callback(
                  '/<pre[^>]*>.*<\/pre>/ismU',
                  array($this, 'pregPreCallback'),
                  $text,
                  1
              );

              // free memory
              $this->preContent = '';
          }
      }

      /**
       * Helper function for BLOCKQUOTE body conversion.
       *
       * Each top-level blockquote (nested ones are handled by the recursive
       * converter() call on the body) is converted to text, prefixed with
       * "> " citation markers and re-inserted as a <pre> block.
       *
       * @param string &$text HTML content
       */
      protected function convertBlockquotes(&$text)
      {
          if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
              $originalText = $text;
              $start = 0;
              $taglen = 0;
              $level = 0;
              // Accumulated length difference between $originalText and the
              // progressively rewritten $text, used to translate offsets.
              $diff = 0;
              foreach ($matches[0] as $m) {
                  // preg offsets are in bytes; convert to a character offset.
                  $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
                  if ($m[0][0] == '<' && $m[0][1] == '/') {
                      $level--;
                      if ($level < 0) {
                          $level = 0; // malformed HTML: go to next blockquote
                      } elseif ($level > 0) {
                          // skip inner blockquote
                      } else {
                          $end = $m[1];
                          $len = $end - $taglen - $start;
                          // Get blockquote content
                          $body = mb_substr($text, $start + $taglen - $diff, $len);

                          // Set text width: leave room for the "> " marker
                          $pWidth = $this->options['width'];
                          if ($this->options['width'] > 0) {
                              $this->options['width'] -= 2;
                          }
                          // Convert blockquote content
                          $body = trim($body);
                          $this->converter($body);
                          // Add citation markers and create PRE block
                          $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                          $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
                          // Re-set text width
                          $this->options['width'] = $pWidth;
                          // Replace content
                          $text = mb_substr($text, 0, $start - $diff)
                              . $body
                              . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

                          $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
                          unset($body);
                      }
                  } else {
                      if ($level == 0) {
                          // Remember where the outermost blockquote starts.
                          $start = $m[1];
                          $taglen = mb_strlen($m[0]);
                      }
                      $level++;
                  }
              }
          }
      }

      /**
       * Callback function for preg_replace_callback use.
       *
       * Dispatches on the tag name captured in $matches[1] by the
       * $callbackSearch patterns.
       *
       * @param  array  $matches PREG matches
       * @return string Replacement text; an empty string for an unrecognised tag
       */
      protected function pregCallback($matches)
      {
          switch (mb_strtolower($matches[1])) {
              case 'p':
                  // Replace newlines with spaces.
                  $para = str_replace("\n", " ", $matches[3]);

                  // Trim trailing and leading whitespace within the tag.
                  $para = trim($para);

                  // Add trailing newlines for this para.
                  return "\n" . $para . "\n";
              case 'br':
                  return "\n";
              case 'b':
              case 'strong':
                  return $this->toupper($matches[3]);
              case 'del':
                  return $this->tostrike($matches[3]);
              case 'th':
                  return $this->toupper("\t\t" . $matches[3] . "\n");
              case 'h':
                  return $this->toupper("\n\n" . $matches[3] . "\n\n");
              case 'a':
                  // override the link method
                  $linkOverride = null;
                  if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
                      $linkOverride = $linkOverrideMatch[1];
                  }
                  // Remove spaces in URL (#1487805)
                  $url = str_replace(' ', '', $matches[3]);

                  return $this->buildlinkList($url, $matches[5], $linkOverride);
          }

          return '';
      }

      /**
       * Callback function for preg_replace_callback use in PRE content handler.
       *
       * Ignores the match and returns the content prepared by convertPre().
       *
       * @param  array  $matches PREG matches
       * @return string
       */
      protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
      {
          return $this->preContent;
      }

      /**
       * Strtoupper function with HTML tags and entities handling.
       *
       * @param  string $str Text to convert
       * @return string Converted text
       */
      protected function toupper($str)
      {
          // string can contain HTML tags
          $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

          // convert toupper only the text between HTML tags
          foreach ($chunks as $i => $chunk) {
              if ($chunk[0] != '<') {
                  $chunks[$i] = $this->strtoupper($chunk);
              }
          }

          return implode($chunks);
      }

      /**
       * Strtoupper multibyte wrapper function with HTML entities handling.
       *
       * Entities are decoded before upper-casing and special characters are
       * re-encoded afterwards.
       *
       * @param  string $str Text to convert
       * @return string Converted text
       */
      protected function strtoupper($str)
      {
          $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
          $str = mb_strtoupper($str);
          $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

          return $str;
      }

      /**
       * Helper function for DEL conversion.
       *
       * Appends a combining strike-through mark after every character.
       *
       * @param  string $str HTML content
       * @return string Converted text
       */
      protected function tostrike($str)
      {
          // UTF-8 encoding of U+0336 (COMBINING LONG STROKE OVERLAY).
          $combiningChr = "\xCC\xB6";
          $length = mb_strlen($str);

          $rtn = '';
          for ($i = 0; $i < $length; $i++) {
              $rtn .= mb_substr($str, $i, 1) . $combiningChr;
          }

          return $rtn;
      }
  }