Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 3.10.x will end 8 November 2021 (12 months).
  • Bug fixes for security issues in 3.10.x will end 9 May 2022 (18 months).
  • PHP version: minimum PHP 7.2.0. Note: the minimum PHP version has increased since Moodle 3.8. PHP 7.3.x and 7.4.x are supported too.

Differences Between: [Versions 310 and 401] [Versions 310 and 402] [Versions 310 and 403]

   1  <?php
   2  /**
   3   * Copyright 2010-2017 Horde LLC (http://www.horde.org/)
   4   *
   5   * See the enclosed file LICENSE for license information (LGPL). If you
   6   * did not receive this file, see http://www.horde.org/licenses/lgpl21.
   7   *
   8   * @category  Horde
   9   * @copyright 2010-2017 Horde LLC
  10   * @package   Util
  11   * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
  12   */
  13  
  14  /**
  15   * Parse DOM data from HTML strings.
  16   *
  17   * @author    Michael Slusarz <slusarz@horde.org>
  18   * @category  Horde
  19   * @copyright 2010-2017 Horde LLC
  20   * @package   Util
  21   * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
  22   */
  23  class Horde_Domhtml implements Iterator
  24  {
  25      /**
  26       * DOM object.
  27       *
  28       * @var DOMDocument
  29       */
  30      public $dom;
  31  
  32      /**
  33       * Iterator status.
  34       *
  35       * @var array
  36       */
  37      protected $_iterator = null;
  38  
  39      /**
  40       * Original charset of data.
  41       *
  42       * @var string
  43       */
  44      protected $_origCharset;
  45  
  46      /**
  47       * Encoding tag added to beginning of output.
  48       *
  49       * @var string
  50       */
  51      protected $_xmlencoding = '';
  52  
  53      /**
  54       * Constructor.
  55       *
  56       * @param string $text     The text of the HTML document.
  57       * @param string $charset  The charset of the HTML document.
  58       *
  59       * @throws Exception
  60       */
  61      public function __construct($text, $charset = null)
  62      {
  63          if (!extension_loaded('dom')) {
  64              throw new Exception('DOM extension is not available.');
  65          }
  66  
  67          // Bug #9616: Make sure we have valid HTML input.
  68          if (!strlen($text)) {
  69              $text = '<html></html>';
  70          }
  71  
  72          $old_error = libxml_use_internal_errors(true);
  73          $this->dom = new DOMDocument();
  74  
  75          if (is_null($charset)) {
  76              /* If no charset given, charset is whatever libxml tells us the
  77               * encoding should be defaulting to 'iso-8859-1'. */
  78              $this->_loadHTML($text);
  79              $this->_origCharset = $this->dom->encoding
  80                  ? $this->dom->encoding
  81                  : 'iso-8859-1';
  82          } else {
  83              /* Convert/try with UTF-8 first. */
  84              $this->_origCharset = Horde_String::lower($charset);
  85              $this->_xmlencoding = '<?xml encoding="UTF-8"?>';
  86              $this->_loadHTML(
  87                  $this->_xmlencoding . Horde_String::convertCharset($text, $charset, 'UTF-8')
  88              );
  89  
  90              if ($this->dom->encoding &&
  91                  (Horde_String::lower($this->dom->encoding) != 'utf-8')) {
  92                  /* Convert charset to what the HTML document says it SHOULD
  93                   * be. */
  94                  $this->_loadHTML(
  95                      Horde_String::convertCharset($text, $charset, $this->dom->encoding)
  96                  );
  97                  $this->_xmlencoding = '';
  98              }
  99          }
 100  
 101          if ($old_error) {
 102              libxml_use_internal_errors(false);
 103          }
 104  
 105          /* Sanity checking: make sure we have the documentElement object. */
 106          if (!$this->dom->documentElement) {
 107              $this->dom->appendChild($this->dom->createElement('html'));
 108          }
 109  
 110          /* Remove old charset information. */
 111          $xpath = new DOMXPath($this->dom);
 112          $domlist = $xpath->query('/html/head/meta[@http-equiv="content-type"]');
 113          for ($i = $domlist->length; $i > 0; --$i) {
 114              $meta = $domlist->item($i - 1);
 115              $meta->parentNode->removeChild($meta);
 116          }
 117      }
 118  
 119      /**
 120       * Returns the HEAD element, or creates one if it doesn't exist.
 121       *
 122       * @return DOMElement  HEAD element.
 123       */
 124      public function getHead()
 125      {
 126          $head = $this->dom->getElementsByTagName('head');
 127          if ($head->length) {
 128              return $head->item(0);
 129          }
 130  
 131          $headelt = $this->dom->createElement('head');
 132          $this->dom->documentElement->insertBefore($headelt, $this->dom->documentElement->firstChild);
 133  
 134          return $headelt;
 135      }
 136  
 137      /**
 138       * Returns the BODY element, or creates one if it doesn't exist.
 139       *
 140       * @since 2.2.0
 141       *
 142       * @return DOMElement  BODY element.
 143       */
 144      public function getBody()
 145      {
 146          $body = $this->dom->getElementsByTagName('body');
 147          if ($body->length) {
 148              return $body->item(0);
 149          }
 150  
 151          $bodyelt = $this->dom->createElement('body');
 152          $this->dom->documentElement->appendChild($bodyelt);
 153  
 154          return $bodyelt;
 155      }
 156  
 157      /**
 158       * Returns the full HTML text in the original charset.
 159       *
 160       * @param array $opts  Additional options: (since 2.1.0)
 161       *   - charset: (string) Return using this charset. If set but empty, will
 162       *              return as currently stored in the DOM object.
 163       *   - metacharset: (boolean) If true, will add a META tag containing the
 164       *                  charset information.
 165       *
 166       * @return string  HTML text.
 167       */
 168      public function returnHtml(array $opts = array())
 169      {
 170          $curr_charset = $this->getCharset();
 171          if (strcasecmp($curr_charset, 'US-ASCII') === 0) {
 172              $curr_charset = 'UTF-8';
 173          }
 174          $charset = array_key_exists('charset', $opts)
 175              ? (empty($opts['charset']) ? $curr_charset : $opts['charset'])
 176              : $this->_origCharset;
 177  
 178          if (empty($opts['metacharset'])) {
 179              $text = $this->dom->saveHTML();
 180          } else {
 181              /* Add placeholder for META tag. Can't add charset yet because DOM
 182               * extension will alter output if it exists. */
 183              $meta = $this->dom->createElement('meta');
 184              $meta->setAttribute('http-equiv', 'content-type');
 185              $meta->setAttribute('horde_dom_html_charset', '');
 186  
 187              $head = $this->getHead();
 188              $head->insertBefore($meta, $head->firstChild);
 189  
 190              $text = str_replace(
 191                  'horde_dom_html_charset=""',
 192                  'content="text/html; charset=' . $charset . '"',
 193                  $this->dom->saveHTML()
 194              );
 195  
 196              $head->removeChild($meta);
 197          }
 198  
 199          if (strcasecmp($curr_charset, $charset) !== 0) {
 200              $text = Horde_String::convertCharset($text, $curr_charset, $charset);
 201          }
 202  
 203          if (!$this->_xmlencoding ||
 204              (($pos = strpos($text, $this->_xmlencoding)) === false)) {
 205              return $text;
 206          }
 207  
 208          return substr_replace($text, '', $pos, strlen($this->_xmlencoding));
 209      }
 210  
 211      /**
 212       * Returns the body text in the original charset.
 213       *
 214       * @return string  HTML text.
 215       */
 216      public function returnBody()
 217      {
 218          $body = $this->getBody();
 219          $text = '';
 220  
 221          if ($body->hasChildNodes()) {
 222              foreach ($body->childNodes as $child) {
 223                  $text .= $this->dom->saveXML($child);
 224              }
 225          }
 226  
 227          return Horde_String::convertCharset($text, 'UTF-8', $this->_origCharset);
 228      }
 229  
 230      /**
 231       * Get the charset of the DOM data.
 232       *
 233       * @since 2.1.0
 234       *
 235       * @return string  Charset of DOM data.
 236       */
 237      public function getCharset()
 238      {
 239          return $this->dom->encoding
 240              ? $this->dom->encoding
 241              : ($this->_xmlencoding ? 'UTF-8' : $this->_origCharset);
 242      }
 243  
 244      /**
 245       * Loads the HTML data.
 246       *
 247       * @param string $html  HTML data.
 248       */
 249      protected function _loadHTML($html)
 250      {
 251          if (version_compare(PHP_VERSION, '5.4', '>=')) {
 252              $mask = defined('LIBXML_PARSEHUGE')
 253                  ? LIBXML_PARSEHUGE
 254                  : 0;
 255              $mask |= defined('LIBXML_COMPACT')
 256                  ? LIBXML_COMPACT
 257                  : 0;
 258              $this->dom->loadHTML($html, $mask);
 259          } else {
 260              $this->dom->loadHTML($html);
 261          }
 262      }
 263  
 264      /* Iterator methods. */
 265  
 266      /**
 267       */
 268      public function current()
 269      {
 270          if ($this->_iterator instanceof DOMDocument) {
 271              return $this->_iterator;
 272          }
 273  
 274          $curr = end($this->_iterator);
 275          return $curr['list']->item($curr['i']);
 276      }
 277  
 278      /**
 279       */
 280      public function key()
 281      {
 282          return 0;
 283      }
 284  
 285      /**
 286       */
 287      public function next()
 288      {
 289          /* Iterate in the reverse direction through the node list. This allows
 290           * alteration of the original list without breaking things (foreach()
 291           * w/removeChild() may exit iteration after removal is complete. */
 292  
 293          if ($this->_iterator instanceof DOMDocument) {
 294              $this->_iterator = array();
 295              $curr = array();
 296              $node = $this->dom;
 297          } elseif (empty($this->_iterator)) {
 298              $this->_iterator = null;
 299              return;
 300          } else {
 301              $curr = &$this->_iterator[count($this->_iterator) - 1];
 302              $node = $curr['list']->item($curr['i']);
 303          }
 304  
 305          if (empty($curr['child']) &&
 306              ($node instanceof DOMNode) &&
 307              $node->hasChildNodes()) {
 308              $curr['child'] = true;
 309              $this->_iterator[] = array(
 310                  'child' => false,
 311                  'i' => $node->childNodes->length - 1,
 312                  'list' => $node->childNodes
 313              );
 314          } elseif (--$curr['i'] < 0) {
 315              array_pop($this->_iterator);
 316              $this->next();
 317          } else {
 318              $curr['child'] = false;
 319          }
 320      }
 321  
 322      /**
 323       */
 324      public function rewind()
 325      {
 326          $this->_iterator = $this->dom;
 327      }
 328  
 329      /**
 330       */
 331      public function valid()
 332      {
 333          return !is_null($this->_iterator);
 334      }
 335  
 336  }