Differences Between: [Versions 310 and 402] [Versions 311 and 402] [Versions 39 and 402] [Versions 400 and 402]
<?php

/**
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 *
 * A lexer parses a string of SGML-style markup and converts them into
 * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
 *
 * A lexer is HTML-oriented: it might work with XML, but it's not
 * recommended, as we adhere to a subset of the specification for optimization
 * reasons. This might change in the future. Also, most tokenizers are not
 * expected to handle DTDs or PIs.
 *
 * This class should not be directly instantiated, but you may use create() to
 * retrieve a default copy of the lexer.  Being a supertype, this class
 * does not actually define any implementation, but offers commonly used
 * convenience functions for subclasses.
 *
 * @note The unit tests will instantiate this class for testing purposes, as
 *       many of the utility functions require a class to be instantiated.
 *       This means that, even though this class is not runnable, it will
 *       not be declared abstract.
 *
 * @par
 *
 * @note
 * We use tokens rather than create a DOM representation because DOM would:
 *
 * @par
 *  -# Require more processing and memory to create,
 *  -# Is not streamable, and
 *  -# Has the entire document structure (html and body not needed).
 *
 * @par
 * However, DOM is helpful in that it makes it easy to move around nodes
 * without a lot of lookaheads to see when a tag is closed. This is a
 * limitation of the token system and some workarounds would be nice.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implements line-number/column-number tracking.
     * If it does, set to true.
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser used by parseData() and normalize() for entities that
     * are not covered by $_special_entity2str.
     * @type HTMLPurifier_EntityParser
     */
    private $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets the default Lexer as a Prototype Factory.
     *
     * By default HTMLPurifier_Lexer_DOMLex will be returned. There are
     * a few exceptions involving special features that only DirectLex
     * implements.
     *
     * @note The behavior of this class has changed, rather than accepting
     *       a prototype object, it now accepts a configuration object.
     *       To specify your own prototype, set %Core.LexerImpl to it.
     *       This change in behavior de-singletonizes the lexer object.
     *
     * @param HTMLPurifier_Config $config Configuration object. Passing a
     *        lexer prototype instead is deprecated: it raises an
     *        E_USER_WARNING and the prototype is returned as-is.
     * @return HTMLPurifier_Lexer
     * @throws HTMLPurifier_Exception If the lexer name is not recognized, if
     *         no lexer could be instantiated, or if line-number tracking is
     *         required but the selected lexer does not support it.
     */
    public static function create($config)
    {
        if ($config instanceof HTMLPurifier_Config) {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        } else {
            $lexer = $config;
            trigger_error(
                "Passing a prototype to
                HTMLPurifier_Lexer::create() is deprecated, please instead
                use %Core.LexerImpl",
                E_USER_WARNING
            );
            // A prototype is not a configuration object, so there are no
            // directives to consult; calling get() on it would be a fatal
            // error. Treat line-number tracking as not requested.
            $needs_tracking = false;
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {
            if (is_null($lexer)) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    $lexer = 'DirectLex';
                } elseif (class_exists('DOMDocument', false) &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception(
                        "Cannot instantiate unrecognized Lexer type " .
                        htmlspecialchars($lexer)
                    );
            }
        }

        if (!$inst) {
            throw new HTMLPurifier_Exception('No lexer was instantiated');
        }

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception(
                'Cannot use lexer that does not support line numbers with ' .
                'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
            );
        }

        return $inst;
    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Creates the entity parser used by parseData() and normalize().
     */
    public function __construct()
    {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special entities.
     * Keys are the escaped forms as they appear in markup; values are the
     * characters they stand for.
     * @type array
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;' => '&',
            '&lt;' => '<',
            '&gt;' => '>',
            '&#39;' => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses entities in character data found in a text context.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseText($string, $config)
    {
        return $this->parseData($string, false, $config);
    }

    /**
     * Parses entities in character data found in an attribute value.
     *
     * @param string $string String character data to be parsed.
     * @param HTMLPurifier_Config $config
     * @return string Parsed character data.
     */
    public function parseAttr($string, $config)
    {
        return $this->parseData($string, true, $config);
    }

    /**
     * Parses special entities into the proper characters.
     *
     * This method will translate escaped versions of the special characters
     * into the correct ones. The entity parser is only invoked when
     * ampersands remain that the special-entity table did not account for.
     *
     * @param string $string String character data to be parsed.
     * @param bool $is_attr True if the data is an attribute value, false if
     *        it is text; selects which entity-parser method is used when
     *        %Core.LegacyEntityDecoder is off.
     * @param HTMLPurifier_Config $config Only consulted when uncommon
     *        entities may be present.
     * @return string Parsed character data.
     */
    public function parseData($string, $is_attr, $config)
    {
        // following functions require at least one character
        if ($string === '') {
            return '';
        }

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if (!$num_amp) {
            return $string;
        } // abort if no entities

        // every "&amp;" leaves one literal "&" behind after translation;
        // those must not be mistaken for the start of another entity below
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string) - 1] === '&' ? 1 : 0);

        if ($num_amp_2 <= $num_esc_amp) {
            return $string;
        }

        // hmm... now we have some uncommon entities. Use the callback.
        if ($config->get('Core.LegacyEntityDecoder')) {
            $string = $this->_entity_parser->substituteSpecialEntities($string);
        } else {
            if ($is_attr) {
                $string = $this->_entity_parser->substituteAttrEntities($string);
            } else {
                $string = $this->_entity_parser->substituteTextEntities($string);
            }
        }
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; subclasses are
     * expected to override it.
     *
     * @param $string String HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[] array representation of HTML.
     */
    public function tokenizeHTML($string, $config, $context)
    {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates CDATA sections into regular sections (through escaping).
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCDATA($string)
    {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case that is especially convoluted for <script>
     * @param string $string HTML string to process.
     * @return string HTML with CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string)
    {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special Internet Explorer conditional comments should be removed.
     * @param string $string HTML string to process.
     * @return string HTML with conditional comments removed.
     */
    protected static function removeIEConditional($string)
    {
        return preg_replace(
            '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
            '',
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA() that
     * does the work.
     *
     * @warning This is only meant to be used as a preg_replace_callback()
     *          callback from within this class; calling it directly is not
     *          recommended.
     * @param array $matches PCRE matches array, with index 0 the entire match
     *                  and 1 the inside of the CDATA section.
     * @return string Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches)
    {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities, fixing
     * encoding, extracting bits, and other good stuff.
     * @param string $html HTML.
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return string
     * @todo Consider making protected
     */
    public function normalize($html, $config, $context)
    {
        // normalize newlines to \n
        if ($config->get('Core.NormalizeNewlines')) {
            $html = str_replace("\r\n", "\n", (string)$html);
            $html = str_replace("\r", "\n", (string)$html);
        }

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        $html = $this->removeIEConditional($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $e = false;
            if ($config->get('Core.CollectErrors')) {
                $e =& $context->get('ErrorCollector');
            }
            $new_html = $this->extractBody($html);
            if ($e && $new_html != $html) {
                $e->send(E_WARNING, 'Lexer: Extracted body');
            }
            $html = $new_html;
        }

        // expand entities that aren't the big five
        if ($config->get('Core.LegacyEntityDecoder')) {
            $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        }

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        // if processing instructions are to be removed, remove them now
        if ($config->get('Core.RemoveProcessingInstructions')) {
            $html = preg_replace('#<\?.+?\?>#s', '', $html);
        }

        // strip whole <script> elements up front, but only when the document
        // is untrusted, script contents are to be removed, and "script" is
        // listed among the hidden elements
        $hidden_elements = $config->get('Core.HiddenElements');
        if ($config->get('Core.AggressivelyRemoveScript') &&
            !($config->get('HTML.Trusted') || !$config->get('Core.RemoveScriptContents')
            || empty($hidden_elements["script"]))) {
            $html = preg_replace('#<script[^>]*>.*?</script>#i', '', $html);
        }

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the content
     * of its body element. The input is returned unchanged when no
     * <body>...</body> pair is found, or when the opening tag appears to sit
     * inside an unterminated comment.
     *
     * @param string $html HTML fragment or document.
     * @return string Body contents, or the original string.
     * @todo Consider making protected
     */
    public function extractBody($html)
    {
        $matches = array();
        $result = preg_match('|(.*?)<body[^>]*>(.*)</body>|is', $html, $matches);
        if ($result) {
            // Make sure it's not in a comment
            $comment_start = strrpos($matches[1], '<!--');
            $comment_end = strrpos($matches[1], '-->');
            if ($comment_start === false ||
                ($comment_end !== false && $comment_end > $comment_start)) {
                return $matches[2];
            }
        }
        return $html;
    }
}

// vim: et sw=4 sts=4
title
Description
Body
title
Description
Body
title
Description
Body
title
Body