Differences Between: [Versions 310 and 403] [Versions 311 and 403] [Versions 39 and 403] [Versions 400 and 403]
<?php
/**
 * Copyright 2010-2017 Horde LLC (http://www.horde.org/)
 *
 * See the enclosed file LICENSE for license information (LGPL). If you
 * did not receive this file, see http://www.horde.org/licenses/lgpl21.
 *
 * @category  Horde
 * @copyright 2010-2017 Horde LLC
 * @package   Util
 * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
 */

/**
 * Parse DOM data from HTML strings.
 *
 * The parsed document is exposed through the public $dom property. The
 * object can also be used in a foreach() loop: iteration starts with the
 * DOMDocument itself and then walks the node tree, visiting the children of
 * each node from last to first.
 *
 * @author    Michael Slusarz <slusarz@horde.org>
 * @category  Horde
 * @copyright 2010-2017 Horde LLC
 * @package   Util
 * @license   http://www.horde.org/licenses/lgpl21 LGPL 2.1
 */
class Horde_Domhtml implements Iterator
{
    /**
     * DOM object.
     *
     * @var DOMDocument
     */
    public $dom;

    /**
     * Iterator status.
     *
     * One of:
     *   - null: iteration has not been started or has finished.
     *   - DOMDocument: iteration was just rewound; the document itself is
     *     the current element.
     *   - array: stack of traversal frames. Each frame is an array with the
     *     keys 'list' (DOMNodeList), 'i' (index of the current item in the
     *     list) and 'child' (whether the children of the current item have
     *     already been pushed onto the stack).
     *
     * @var mixed
     */
    protected $_iterator = null;

    /**
     * Original charset of data.
     *
     * @var string
     */
    protected $_origCharset;

    /**
     * Encoding tag added to beginning of output.
     *
     * @var string
     */
    protected $_xmlencoding = '';

    /**
     * Constructor.
     *
     * libxml internal error handling is enabled while the document is
     * parsed, and the setting that was active before the call is restored
     * afterwards.
     *
     * @param string $text     The text of the HTML document.
     * @param string $charset  The charset of the HTML document.
     *
     * @throws Exception
     */
    public function __construct($text, $charset = null)
    {
        if (!extension_loaded('dom')) {
            throw new Exception('DOM extension is not available.');
        }

        // Bug #9616: Make sure we have valid HTML input.
        if (!strlen($text)) {
            $text = '<html></html>';
        }

        /* Collect parse errors internally instead of emitting PHP warnings
         * for malformed markup. The previous setting is returned so that it
         * can be restored once parsing is done. */
        $old_error = libxml_use_internal_errors(true);
        $this->dom = new DOMDocument();

        if (is_null($charset)) {
            /* If no charset given, charset is whatever libxml tells us the
             * encoding should be defaulting to 'iso-8859-1'. */
            $this->_loadHTML($text);
            $this->_origCharset = $this->dom->encoding
                ? $this->dom->encoding
                : 'iso-8859-1';
        } else {
            /* Convert/try with UTF-8 first. */
            $this->_origCharset = Horde_String::lower($charset);
            $this->_xmlencoding = '<?xml encoding="UTF-8"?>';
            $this->_loadHTML(
                $this->_xmlencoding . Horde_String::convertCharset($text, $charset, 'UTF-8')
            );

            if ($this->dom->encoding &&
                (Horde_String::lower($this->dom->encoding) != 'utf-8')) {
                /* Convert charset to what the HTML document says it SHOULD
                 * be. */
                $this->_loadHTML(
                    Horde_String::convertCharset($text, $charset, $this->dom->encoding)
                );
                $this->_xmlencoding = '';
            }
        }

        /* Restore the caller's libxml error handling mode. This must be
         * unconditional: the setting is process-wide, so leaving it enabled
         * (or forcing it off) would change behavior of unrelated code. */
        libxml_use_internal_errors($old_error);

        /* Sanity checking: make sure we have the documentElement object. */
        if (!$this->dom->documentElement) {
            $this->dom->appendChild($this->dom->createElement('html'));
        }

        /* Remove old charset information. Walk the list backwards because
         * nodes are removed from the document while looping. */
        $xpath = new DOMXPath($this->dom);
        $domlist = $xpath->query('/html/head/meta[@http-equiv="content-type"]');
        for ($i = $domlist->length; $i > 0; --$i) {
            $meta = $domlist->item($i - 1);
            $meta->parentNode->removeChild($meta);
        }
    }

    /**
     * Returns the HEAD element, or creates one if it doesn't exist.
     *
     * A newly created element is inserted as the first child of the
     * document element.
     *
     * @return DOMElement  HEAD element.
     */
    public function getHead()
    {
        $head = $this->dom->getElementsByTagName('head');
        if ($head->length) {
            return $head->item(0);
        }

        $headelt = $this->dom->createElement('head');
        $this->dom->documentElement->insertBefore(
            $headelt,
            $this->dom->documentElement->firstChild
        );

        return $headelt;
    }

    /**
     * Returns the BODY element, or creates one if it doesn't exist.
     *
     * A newly created element is appended as the last child of the
     * document element.
     *
     * @since 2.2.0
     *
     * @return DOMElement  BODY element.
     */
    public function getBody()
    {
        $body = $this->dom->getElementsByTagName('body');
        if ($body->length) {
            return $body->item(0);
        }

        $bodyelt = $this->dom->createElement('body');
        $this->dom->documentElement->appendChild($bodyelt);

        return $bodyelt;
    }

    /**
     * Returns the full HTML text in the original charset.
     *
     * @param array $opts  Additional options: (since 2.1.0)
     *   - charset: (string) Return using this charset. If set but empty, will
     *              return as currently stored in the DOM object.
     *   - metacharset: (boolean) If true, will add a META tag containing the
     *                  charset information.
     *
     * @return string  HTML text.
     */
    public function returnHtml(array $opts = array())
    {
        $curr_charset = $this->getCharset();
        if (strcasecmp($curr_charset, 'US-ASCII') === 0) {
            $curr_charset = 'UTF-8';
        }
        $charset = array_key_exists('charset', $opts)
            ? (empty($opts['charset']) ? $curr_charset : $opts['charset'])
            : $this->_origCharset;

        if (empty($opts['metacharset'])) {
            $text = $this->dom->saveHTML();
        } else {
            /* Add placeholder for META tag. Can't add charset yet because DOM
             * extension will alter output if it exists. */
            $meta = $this->dom->createElement('meta');
            $meta->setAttribute('http-equiv', 'content-type');
            $meta->setAttribute('horde_dom_html_charset', '');

            $head = $this->getHead();
            $head->insertBefore($meta, $head->firstChild);

            /* Swap the placeholder attribute for the real charset
             * declaration in the serialized text only. */
            $text = str_replace(
                'horde_dom_html_charset=""',
                'content="text/html; charset=' . $charset . '"',
                $this->dom->saveHTML()
            );

            /* The temporary META element must not remain in the DOM. */
            $head->removeChild($meta);
        }

        if (strcasecmp($curr_charset, $charset) !== 0) {
            $text = Horde_String::convertCharset($text, $curr_charset, $charset);
        }

        /* Strip the XML encoding tag that the constructor prepended to the
         * input, if it shows up in the output. */
        if (!$this->_xmlencoding ||
            (($pos = strpos($text, $this->_xmlencoding)) === false)) {
            return $text;
        }

        return substr_replace($text, '', $pos, strlen($this->_xmlencoding));
    }

    /**
     * Returns the body text in the original charset.
     *
     * The result is the concatenated serialization of the child nodes of
     * the BODY element; the BODY tag itself is not included.
     *
     * @return string  HTML text.
     */
    public function returnBody()
    {
        $body = $this->getBody();
        $text = '';

        if ($body->hasChildNodes()) {
            foreach ($body->childNodes as $child) {
                $text .= $this->dom->saveXML($child);
            }
        }

        return Horde_String::convertCharset($text, 'UTF-8', $this->_origCharset);
    }

    /**
     * Get the charset of the DOM data.
     *
     * Uses the encoding reported by the DOM object if there is one.
     * Otherwise this is UTF-8 if the XML encoding tag was added to the
     * input, or the original charset if not.
     *
     * @since 2.1.0
     *
     * @return string  Charset of DOM data.
     */
    public function getCharset()
    {
        return $this->dom->encoding
            ? $this->dom->encoding
            : ($this->_xmlencoding ? 'UTF-8' : $this->_origCharset);
    }

    /**
     * Loads the HTML data.
     *
     * On PHP 5.4 and later the LIBXML_PARSEHUGE and LIBXML_COMPACT options
     * are passed to the parser when those constants are defined.
     *
     * @param string $html  HTML data.
     */
    protected function _loadHTML($html)
    {
        if (version_compare(PHP_VERSION, '5.4', '>=')) {
            $mask = defined('LIBXML_PARSEHUGE')
                ? LIBXML_PARSEHUGE
                : 0;
            $mask |= defined('LIBXML_COMPACT')
                ? LIBXML_COMPACT
                : 0;
            $this->dom->loadHTML($html, $mask);
        } else {
            $this->dom->loadHTML($html);
        }
    }

    /* Iterator methods. */

    /**
     * Returns the current node of the iteration.
     *
     * @return mixed  The DOMDocument directly after rewind(), otherwise the
     *                current item of the innermost node list. Null if the
     *                iteration has not been started or has finished.
     */
    #[ReturnTypeWillChange]
    public function current()
    {
        if ($this->_iterator instanceof DOMDocument) {
            return $this->_iterator;
        }

        /* No traversal stack: not rewound yet, or already exhausted. */
        if (!is_array($this->_iterator) || empty($this->_iterator)) {
            return null;
        }

        $curr = end($this->_iterator);
        return $curr['list']->item($curr['i']);
    }

    /**
     * Returns the key of the current element.
     *
     * @return integer  Always 0.
     */
    #[ReturnTypeWillChange]
    public function key()
    {
        return 0;
    }

    /**
     * Advances the iteration to the next node.
     */
    #[ReturnTypeWillChange]
    public function next()
    {
        /* Iterate in the reverse direction through the node list. This allows
         * alteration of the original list without breaking things (foreach()
         * w/removeChild() may exit iteration after removal is complete). */

        if ($this->_iterator instanceof DOMDocument) {
            /* First step after rewind(): start an empty stack and descend
             * from the document node. */
            $this->_iterator = array();
            $curr = array();
            $node = $this->dom;
        } elseif (empty($this->_iterator)) {
            /* Stack is used up: iteration is finished. */
            $this->_iterator = null;
            return;
        } else {
            /* Work on the top frame by reference so that changes to 'i' and
             * 'child' are stored in the stack. */
            $curr = &$this->_iterator[count($this->_iterator) - 1];
            $node = $curr['list']->item($curr['i']);
        }

        if (empty($curr['child']) &&
            ($node instanceof DOMNode) &&
            $node->hasChildNodes()) {
            /* Descend: push the children of the current node, starting at
             * the last child. */
            $curr['child'] = true;
            $this->_iterator[] = array(
                'child' => false,
                'i' => $node->childNodes->length - 1,
                'list' => $node->childNodes
            );
        } elseif (--$curr['i'] < 0) {
            /* This list is done: drop the frame and advance the parent. */
            array_pop($this->_iterator);
            $this->next();
        } else {
            /* Moved to the previous sibling; its children have not been
             * visited yet. */
            $curr['child'] = false;
        }
    }

    /**
     * Restarts the iteration at the DOM document.
     */
    #[ReturnTypeWillChange]
    public function rewind()
    {
        $this->_iterator = $this->dom;
    }

    /**
     * Checks whether the iteration has a current element.
     *
     * @return boolean  False if the iteration has not been started or has
     *                  finished.
     */
    #[ReturnTypeWillChange]
    public function valid()
    {
        return !is_null($this->_iterator);
    }

}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body