Differences Between: [Versions 311 and 401] [Versions 311 and 402] [Versions 311 and 403]
<?php
/**
 * Copyright 2010-2017 Horde LLC (http://www.horde.org/)
 *
 * See the enclosed file LICENSE for license information (LGPL). If you
 * did not receive this file, see http://www.horde.org/licenses/lgpl21.
 *
 * @category  Horde
 * @copyright 2010-2017 Horde LLC
 * @package   Util
 * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
 */

/**
 * Parse DOM data from HTML strings.
 *
 * The object can be used in foreach(): iteration starts with the
 * DOMDocument itself and then walks the node tree, visiting the children
 * of each node from last to first.
 *
 * @author    Michael Slusarz <slusarz@horde.org>
 * @category  Horde
 * @copyright 2010-2017 Horde LLC
 * @package   Util
 * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
 */
class Horde_Domhtml implements Iterator
{
    /**
     * DOM object.
     *
     * @var DOMDocument
     */
    public $dom;

    /**
     * Iterator status. Null when iteration is finished (or not started),
     * the DOMDocument right after rewind(), otherwise a stack of arrays
     * with the keys 'child' (boolean), 'i' (integer) and 'list'
     * (DOMNodeList).
     *
     * @var mixed
     */
    protected $_iterator = null;

    /**
     * Original charset of data.
     *
     * @var string
     */
    protected $_origCharset;

    /**
     * Encoding tag added to beginning of output.
     *
     * @var string
     */
    protected $_xmlencoding = '';

    /**
     * Constructor.
     *
     * @param string $text     The text of the HTML document.
     * @param string $charset  The charset of the HTML document.
     *
     * @throws Exception
     */
    public function __construct($text, $charset = null)
    {
        if (!extension_loaded('dom')) {
            throw new Exception('DOM extension is not available.');
        }

        // Bug #9616: Make sure we have valid HTML input.
        // The cast avoids the PHP 8.1+ deprecation for strlen(null).
        $text = (string)$text;
        if (!strlen($text)) {
            $text = '<html></html>';
        }

        $old_error = libxml_use_internal_errors(true);
        $this->dom = new DOMDocument();

        if (is_null($charset)) {
            /* If no charset given, charset is whatever libxml tells us the
             * encoding should be defaulting to 'iso-8859-1'. */
            $this->_loadHTML($text);
            $this->_origCharset = $this->dom->encoding
                ? $this->dom->encoding
                : 'iso-8859-1';
        } else {
            /* Convert/try with UTF-8 first. */
            $this->_origCharset = Horde_String::lower($charset);
            $this->_xmlencoding = '<?xml encoding="UTF-8"?>';
            $this->_loadHTML(
                $this->_xmlencoding . Horde_String::convertCharset($text, $charset, 'UTF-8')
            );

            if ($this->dom->encoding &&
                (Horde_String::lower($this->dom->encoding) != 'utf-8')) {
                /* Convert charset to what the HTML document says it SHOULD
                 * be. */
                $this->_loadHTML(
                    Horde_String::convertCharset($text, $charset, $this->dom->encoding)
                );
                $this->_xmlencoding = '';
            }
        }

        /* Restore the libxml error mode that was active before parsing.
         * The buffered parse errors are discarded only when the caller was
         * not collecting errors itself, so the buffer does not grow with
         * every parsed document and a caller's own errors are not lost. */
        if (!$old_error) {
            libxml_clear_errors();
        }
        libxml_use_internal_errors($old_error);

        /* Sanity checking: make sure we have the documentElement object. */
        if (!$this->dom->documentElement) {
            $this->dom->appendChild($this->dom->createElement('html'));
        }

        /* Remove old charset information. Walk the list backwards because
         * nodes are removed while it is being traversed. */
        $xpath = new DOMXPath($this->dom);
        $domlist = $xpath->query('/html/head/meta[@http-equiv="content-type"]');
        for ($i = $domlist->length; $i > 0; --$i) {
            $meta = $domlist->item($i - 1);
            $meta->parentNode->removeChild($meta);
        }
    }

    /**
     * Returns the HEAD element, or creates one if it doesn't exist.
     *
     * A newly created element is inserted as the first child of the
     * document element.
     *
     * @return DOMElement  HEAD element.
     */
    public function getHead()
    {
        $head = $this->dom->getElementsByTagName('head');
        if ($head->length) {
            return $head->item(0);
        }

        $headelt = $this->dom->createElement('head');
        $this->dom->documentElement->insertBefore($headelt, $this->dom->documentElement->firstChild);

        return $headelt;
    }

    /**
     * Returns the BODY element, or creates one if it doesn't exist.
     *
     * A newly created element is appended as the last child of the
     * document element.
     *
     * @since 2.2.0
     *
     * @return DOMElement  BODY element.
     */
    public function getBody()
    {
        $body = $this->dom->getElementsByTagName('body');
        if ($body->length) {
            return $body->item(0);
        }

        $bodyelt = $this->dom->createElement('body');
        $this->dom->documentElement->appendChild($bodyelt);

        return $bodyelt;
    }

    /**
     * Returns the full HTML text in the original charset.
     *
     * @param array $opts  Additional options: (since 2.1.0)
     *   - charset: (string) Return using this charset. If set but empty, will
     *              return as currently stored in the DOM object.
     *   - metacharset: (boolean) If true, will add a META tag containing the
     *                  charset information.
     *
     * @return string  HTML text.
     */
    public function returnHtml(array $opts = array())
    {
        $curr_charset = $this->getCharset();
        if (strcasecmp($curr_charset, 'US-ASCII') === 0) {
            $curr_charset = 'UTF-8';
        }
        $charset = array_key_exists('charset', $opts)
            ? (empty($opts['charset']) ? $curr_charset : $opts['charset'])
            : $this->_origCharset;

        if (empty($opts['metacharset'])) {
            $text = $this->dom->saveHTML();
        } else {
            /* Add placeholder for META tag. Can't add charset yet because DOM
             * extension will alter output if it exists. */
            $meta = $this->dom->createElement('meta');
            $meta->setAttribute('http-equiv', 'content-type');
            $meta->setAttribute('horde_dom_html_charset', '');

            $head = $this->getHead();
            $head->insertBefore($meta, $head->firstChild);

            $text = str_replace(
                'horde_dom_html_charset=""',
                'content="text/html; charset=' . $charset . '"',
                $this->dom->saveHTML()
            );

            /* The placeholder is only needed for serialization; take it out
             * again so the DOM is left as it was. */
            $head->removeChild($meta);
        }

        if (strcasecmp($curr_charset, $charset) !== 0) {
            $text = Horde_String::convertCharset($text, $curr_charset, $charset);
        }

        /* Strip the XML encoding tag the constructor may have prepended. */
        if (!$this->_xmlencoding ||
            (($pos = strpos($text, $this->_xmlencoding)) === false)) {
            return $text;
        }

        return substr_replace($text, '', $pos, strlen($this->_xmlencoding));
    }

    /**
     * Returns the body text in the original charset.
     *
     * The children of the BODY element are serialized with saveXML() and
     * the result is converted from UTF-8 to the original charset.
     *
     * @return string  HTML text.
     */
    public function returnBody()
    {
        $body = $this->getBody();
        $text = '';

        if ($body->hasChildNodes()) {
            foreach ($body->childNodes as $child) {
                $text .= $this->dom->saveXML($child);
            }
        }

        return Horde_String::convertCharset($text, 'UTF-8', $this->_origCharset);
    }

    /**
     * Get the charset of the DOM data.
     *
     * Uses the encoding reported by the DOM object if there is one.
     * Otherwise returns 'UTF-8' if the XML encoding tag is in use, or the
     * original charset if it is not.
     *
     * @since 2.1.0
     *
     * @return string  Charset of DOM data.
     */
    public function getCharset()
    {
        return $this->dom->encoding
            ? $this->dom->encoding
            : ($this->_xmlencoding ? 'UTF-8' : $this->_origCharset);
    }

    /**
     * Loads the HTML data.
     *
     * On PHP 5.4 and later the LIBXML_PARSEHUGE and LIBXML_COMPACT options
     * are passed to the parser when those constants are defined.
     *
     * @param string $html  HTML data.
     */
    protected function _loadHTML($html)
    {
        if (version_compare(PHP_VERSION, '5.4', '>=')) {
            $mask = defined('LIBXML_PARSEHUGE')
                ? LIBXML_PARSEHUGE
                : 0;
            $mask |= defined('LIBXML_COMPACT')
                ? LIBXML_COMPACT
                : 0;
            $this->dom->loadHTML($html, $mask);
        } else {
            $this->dom->loadHTML($html);
        }
    }

    /* Iterator methods. The #[\ReturnTypeWillChange] attribute silences the
     * PHP 8.1+ return type deprecation; older PHP versions parse that line
     * as a comment. */

    /**
     * Returns the node at the current iterator position.
     *
     * @return DOMNode  The DOMDocument directly after rewind(), otherwise
     *                  the current node of the innermost node list.
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        if ($this->_iterator instanceof DOMDocument) {
            return $this->_iterator;
        }

        $curr = end($this->_iterator);
        return $curr['list']->item($curr['i']);
    }

    /**
     * Returns the iterator key.
     *
     * @return integer  Always 0.
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return 0;
    }

    /**
     * Advances the iterator to the next node.
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        /* Iterate in the reverse direction through the node list. This allows
         * alteration of the original list without breaking things (foreach()
         * w/removeChild() may exit iteration after removal is complete. */

        if ($this->_iterator instanceof DOMDocument) {
            $this->_iterator = array();
            $curr = array();
            $node = $this->dom;
        } elseif (empty($this->_iterator)) {
            /* Stack exhausted: mark the iteration as finished. */
            $this->_iterator = null;
            return;
        } else {
            $curr = &$this->_iterator[count($this->_iterator) - 1];
            $node = $curr['list']->item($curr['i']);
        }

        if (empty($curr['child']) &&
            ($node instanceof DOMNode) &&
            $node->hasChildNodes()) {
            /* Descend into the children, starting with the last one. The
             * flag records that this node's children have been visited. */
            $curr['child'] = true;
            $this->_iterator[] = array(
                'child' => false,
                'i' => $node->childNodes->length - 1,
                'list' => $node->childNodes
            );
        } elseif (--$curr['i'] < 0) {
            /* This list is finished: continue in the parent list. */
            array_pop($this->_iterator);
            $this->next();
        } else {
            $curr['child'] = false;
        }
    }

    /**
     * Resets the iterator to the DOMDocument.
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->_iterator = $this->dom;
    }

    /**
     * Checks whether the iterator points to a node.
     *
     * @return boolean  False once the iteration is finished.
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return !is_null($this->_iterator);
    }

}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body