<?php

/**
 * Generates HTML from tokens.
 *
 * The generator caches the output-related configuration directives it needs
 * at construction time and then serializes HTMLPurifier_Token objects into
 * an HTML string.
 *
 * @todo Refactor interface so that configuration/context is determined
 *       upon instantiation, no need for messy generateFromTokens() calls
 * @todo Make some of the more internal functions protected, and have
 *       unit tests work around that
 */
class HTMLPurifier_Generator
{

    /**
     * Whether or not generator should produce XML output.
     * Overwritten in the constructor from the HTML definition's doctype.
     * @type bool
     */
    private $_xhtml = true;

    /**
     * :HACK: Whether or not generator should comment the insides of <script> tags.
     * Cache of %Output.CommentScriptContents.
     * @type bool
     */
    private $_scriptFix = false;

    /**
     * Cache of HTMLDefinition during HTML output to determine whether or
     * not attributes should be minimized.
     * @type HTMLPurifier_HTMLDefinition
     */
    private $_def;

    /**
     * Cache of %Output.SortAttr.
     * @type bool
     */
    private $_sortAttr;

    /**
     * Cache of %Output.FlashCompat.
     * @type bool
     */
    private $_flashCompat;

    /**
     * Cache of %Output.FixInnerHTML.
     * @type bool
     */
    private $_innerHTMLFix;

    /**
     * Stack for keeping track of object information when outputting IE
     * compatibility code. One entry is pushed per opening <object> tag and
     * popped again on the matching closing tag.
     * @type array
     */
    private $_flashStack = array();

    /**
     * Configuration for the generator
     * @type HTMLPurifier_Config
     */
    protected $config;

    /**
     * Reads and caches every configuration value the generator consults.
     *
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context Accepted for interface
     *        compatibility; not read by this constructor.
     */
    public function __construct($config, $context)
    {
        $this->config = $config;
        $this->_scriptFix = $config->get('Output.CommentScriptContents');
        $this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
        $this->_sortAttr = $config->get('Output.SortAttr');
        $this->_flashCompat = $config->get('Output.FlashCompat');
        $this->_def = $config->getHTMLDefinition();
        $this->_xhtml = $this->_def->doctype->xml;
    }

    /**
     * Generates HTML from an array of tokens.
     *
     * After serializing the tokens, the result is optionally run through
     * Tidy (%Output.TidyFormat, only if the tidy extension is loaded) and
     * has its newlines rewritten (%Core.NormalizeNewlines, using
     * %Output.Newline or PHP_EOL when that is null).
     *
     * @param HTMLPurifier_Token[] $tokens Array of HTMLPurifier_Token
     * @return string Generated HTML
     */
    public function generateFromTokens($tokens)
    {
        if (!$tokens) {
            return '';
        }

        // Basic algorithm
        $html = '';
        for ($i = 0, $size = count($tokens); $i < $size; $i++) {
            // Script special case: the contents of the script block must be
            // ONE token for this to work. The instanceof check on the
            // current token matters: without it a closing </script> tag
            // (whose name is also 'script') could trigger this branch and
            // cause the text that FOLLOWS the script element to be emitted
            // unescaped inside the comment wrapper.
            if ($this->_scriptFix
                && $tokens[$i] instanceof HTMLPurifier_Token_Start
                && $tokens[$i]->name === 'script'
                && $i + 2 < $size
                && $tokens[$i + 2] instanceof HTMLPurifier_Token_End
            ) {
                $html .= $this->generateFromToken($tokens[$i++]);
                $html .= $this->generateScriptFromToken($tokens[$i++]);
                // $i now points at the end tag, emitted just below
            }
            $html .= $this->generateFromToken($tokens[$i]);
        }

        // Tidy cleanup
        if (extension_loaded('tidy') && $this->config->get('Output.TidyFormat')) {
            $tidy = new Tidy;
            $tidy->parseString(
                $html,
                array(
                    'indent' => true,
                    'output-xhtml' => $this->_xhtml,
                    'show-body-only' => true,
                    'indent-spaces' => 2,
                    'wrap' => 68,
                ),
                'utf8'
            );
            $tidy->cleanRepair();
            $html = (string) $tidy; // explicit cast necessary
        }

        // Normalize newlines to system defined value
        if ($this->config->get('Core.NormalizeNewlines')) {
            $nl = $this->config->get('Output.Newline');
            if ($nl === null) {
                $nl = PHP_EOL;
            }
            if ($nl !== "\n") {
                $html = str_replace("\n", $nl, $html);
            }
        }
        return $html;
    }

    /**
     * Generates HTML from a single token.
     *
     * Start tags render as <name attrs>, end tags as </name>, empty tags as
     * <name attrs /> (or <name attrs> for non-XML doctypes), text is escaped
     * without touching quotes, and comment data is emitted verbatim between
     * comment delimiters. Any other token type yields an empty string.
     *
     * @param HTMLPurifier_Token $token HTMLPurifier_Token object.
     * @return string Generated HTML; empty string (plus an E_USER_WARNING)
     *         if $token is not an HTMLPurifier_Token.
     */
    public function generateFromToken($token)
    {
        if (!$token instanceof HTMLPurifier_Token) {
            trigger_error('Cannot generate HTML from non-HTMLPurifier_Token object', E_USER_WARNING);
            return '';

        } elseif ($token instanceof HTMLPurifier_Token_Start) {
            $attr = $this->generateAttributes($token->attr, $token->name);
            if ($this->_flashCompat && $token->name == "object") {
                // remember the object so nested <param> tags can be recorded
                $flash = new stdClass();
                $flash->attr = $token->attr;
                $flash->param = array();
                $this->_flashStack[] = $flash;
            }
            return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';

        } elseif ($token instanceof HTMLPurifier_Token_End) {
            if ($this->_flashCompat && $token->name == "object" && !empty($this->_flashStack)) {
                // No extra markup is generated for now; just discard the
                // bookkeeping entry so the stack does not grow forever and
                // later <param> tags are not attributed to a closed object.
                array_pop($this->_flashStack);
            }
            return '</' . $token->name . '>';

        } elseif ($token instanceof HTMLPurifier_Token_Empty) {
            if ($this->_flashCompat
                && $token->name == "param"
                && !empty($this->_flashStack)
                && isset($token->attr['name'], $token->attr['value'])
            ) {
                // only record params that actually carry both attributes
                $current = $this->_flashStack[count($this->_flashStack) - 1];
                $current->param[$token->attr['name']] = $token->attr['value'];
            }
            $attr = $this->generateAttributes($token->attr, $token->name);
            return '<' . $token->name . ($attr ? ' ' : '') . $attr .
                ($this->_xhtml ? ' /' : '') // <br /> v. <br>
                . '>';

        } elseif ($token instanceof HTMLPurifier_Token_Text) {
            return $this->escape($token->data, ENT_NOQUOTES);

        } elseif ($token instanceof HTMLPurifier_Token_Comment) {
            return '<!--' . $token->data . '-->';

        } else {
            return '';
        }
    }

    /**
     * Special case processor for the contents of script tags.
     *
     * Text tokens are wrapped in a comment/CDATA envelope; a trailing "//"
     * (with optional whitespace) is stripped from the data first and the
     * data is trimmed. Non-text tokens are delegated to generateFromToken().
     *
     * @param HTMLPurifier_Token $token HTMLPurifier_Token object.
     * @return string
     * @warning This runs into problems if there's already a literal
     *          --> somewhere inside the script contents.
     */
    public function generateScriptFromToken($token)
    {
        if (!$token instanceof HTMLPurifier_Token_Text) {
            return $this->generateFromToken($token);
        }
        // Thanks <http://lachy.id.au/log/2005/05/script-comments>
        $data = preg_replace('#//\s*$#', '', $token->data);
        return '<!--//--><![CDATA[//><!--' . "\n" . trim($data) . "\n" . '//--><!]]>';
    }

    /**
     * Generates attribute declarations from attribute array.
     *
     * Attributes are sorted by name when %Output.SortAttr is on. For
     * non-XML doctypes, namespaced attributes (containing ':') are dropped
     * and attributes flagged as minimized in the HTML definition are
     * written as a bare name.
     *
     * @note This does not include the leading or trailing space.
     * @param array $assoc_array_of_attributes Attribute array
     * @param string $element Name of element attributes are for, used to check
     *        attribute minimization.
     * @return string Generated HTML fragment for insertion.
     */
    public function generateAttributes($assoc_array_of_attributes, $element = '')
    {
        $html = '';
        if ($this->_sortAttr) {
            ksort($assoc_array_of_attributes);
        }
        foreach ($assoc_array_of_attributes as $key => $value) {
            if (!$this->_xhtml) {
                // Remove namespaced attributes
                if (strpos($key, ':') !== false) {
                    continue;
                }
                // Check if we should minimize the attribute: val="val" -> val
                if ($element && !empty($this->_def->info[$element]->attr[$key]->minimized)) {
                    $html .= $key . ' ';
                    continue;
                }
            }
            // Workaround for Internet Explorer innerHTML bug.
            // Essentially, Internet Explorer, when calculating
            // innerHTML, omits quotes if there are no instances of
            // angled brackets, quotes or spaces.  However, when parsing
            // HTML (for example, when you assign to innerHTML), it
            // treats backticks as quotes.  Thus,
            //      <img alt="``" />
            // becomes
            //      <img alt=`` />
            // becomes
            //      <img alt='' />
            // Fortunately, all we need to do is trigger an appropriate
            // quoting style, which we do by adding an extra space.
            // This also is consistent with the W3C spec, which states
            // that user agents may ignore leading or trailing
            // whitespace (in fact, most don't, at least for attributes
            // like alt, but an extra space at the end is barely
            // noticeable).  Still, we have a configuration knob for
            // this, since this transformation is not necessary if you
            // don't process user input with innerHTML or you don't plan
            // on supporting Internet Explorer.
            if ($this->_innerHTMLFix
                && strpos($value, '`') !== false
                // check if correct quoting style would not already be
                // triggered
                && strcspn($value, '"\' <>') === strlen($value)
            ) {
                // protect!
                $value .= ' ';
            }
            $html .= $key . '="' . $this->escape($value) . '" ';
        }
        return rtrim($html);
    }

    /**
     * Escapes raw text data.
     * @todo This really ought to be protected, but until we have a facility
     *       for properly generating HTML here w/o using tokens, it stays
     *       public.
     * @param string $string String data to escape for HTML.
     * @param int $quote Quoting style, like htmlspecialchars. ENT_NOQUOTES is
     *        permissible for non-attribute output. Defaults to ENT_COMPAT
     *        when null.
     * @return string escaped data.
     */
    public function escape($string, $quote = null)
    {
        // Workaround for APC bug on Mac Leopard reported by sidepodcast
        // http://htmlpurifier.org/phorum/read.php?3,4823,4846
        // (the default is resolved at runtime rather than in the signature)
        if ($quote === null) {
            $quote = ENT_COMPAT;
        }
        return htmlspecialchars($string, $quote, 'UTF-8');
    }
}

// vim: et sw=4 sts=4
title
Description
Body
title
Description
Body
title
Description
Body
title
Body