<?php

/**
 * Injector that auto paragraphs text in the root node based on
 * double-spacing.
 *
 * The handlers below replace the token they are given (passed by reference)
 * with an array of tokens whenever paragraph tags have to be inserted. The
 * "State x.y" comments sketch the token stream being looked at, with the
 * dashes underlining the token currently being handled.
 *
 * @todo Ensure all states are unit tested, including variations as well.
 * @todo Make a graph of the flow control for this Injector.
 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
    /**
     * @type string
     */
    public $name = 'AutoParagraph';

    /**
     * @type array
     */
    public $needed = array('p');

    /**
     * Creates a <p> start token with the 'MakeWellFormed_TagClosedError'
     * armor flag set.
     * NOTE(review): the flag presumably keeps MakeWellFormed from reporting
     * an error when it auto-closes this injected tag -- confirm against the
     * MakeWellFormed strategy.
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart()
    {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Decides whether a text token needs to be wrapped in / split into
     * paragraphs. When it does, $token is replaced by an array of tokens.
     * @param HTMLPurifier_Token_Text $token Text token, passed by reference
     */
    public function handleText(&$token)
    {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the next if-block if you're in the document.

                $i = $nesting = null;
                // The forward scan is always performed; it fills $current
                // (by reference) with the next token when there is one.
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } elseif (!$token->is_whitespace || $this->_isInline($current)) {
                    // State 1.2: PAR1
                    //            ----

                    // State 1.3: PAR1\n\nPAR2
                    //            ------------

                    // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                    //                 ------------
                    $token = array($this->_pStart());
                    $this->_splitText($text, $token);
                } else {
                    // State 1.5: \n<hr />
                    //            --
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)
                //                 ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
            // Is the current parent a <p> tag?
        } elseif (!empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting) - 1]->name === 'p') {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------
            // An empty result array tells _splitText that no start tag was
            // added here, i.e. we are already inside a paragraph.
            $token = array();
            $this->_splitText($text, $token);
            // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Decides whether a start/empty element needs a <p> inserted in front of
     * it (and, in the root node, a separating double-newline). When it does,
     * $token is replaced by an array of tokens.
     * @param HTMLPurifier_Token $token Element token, passed by reference
     */
    public function handleElement(&$token)
    {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>
                    //                  ---
                    // Check if this token is adjacent to the parent token
                    // (seek backwards until token isn't whitespace)
                    $i = null;
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent
                        if ($prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                            //                                  ---
                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            //                              ---
                            // State 1.1.2: <div><br /><b>
                            //                         ---
                            // State 1.1.3: <div>PAR<b>
                            //                      ---
                        }
                    } else {
                        // State 1.2.1: <div><b>
                        //                   ---
                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            //                   ---
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            //                   ---

                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                            //                   ---
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                    //               -----
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    //            ---
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                    //            -----
                }

                $i = null;
                if ($this->backward($i, $prev)) {
                    if (!$prev instanceof HTMLPurifier_Token_Text) {
                        // State 3.1.1: ...</p>{p}<b>
                        //                        ---
                        // State 3.2.1: ...</p><div>
                        //                     -----
                        // $token is still a single token here when the
                        // element was a block element; normalise to an array
                        // before prepending the separator.
                        if (!is_array($token)) {
                            $token = array($token);
                        }
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        //                            ---
                        // State 3.2.2: ...</p>\n\n<div>
                        //                         -----
                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            //                ----
            // State 2.4: <p><b>
            //               ---
        }
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original
     * @param string $data String text data that will be processed
     *          into paragraphs
     * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
     *          tags will be appended onto. An empty array on entry is taken
     *          to mean that the caller did not add a start paragraph token.
     */
    private function _splitText($data, &$result)
    {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end = false;

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        for ($i = 0; $i < $c; $i++) {
            $par = $raw_paragraphs[$i];
            if (trim($par) !== '') {
                $paragraphs[] = $par;
            } else {
                if ($i === 0) {
                    // Double newline at the front
                    if (empty($result)) {
                        // The empty result indicates that the AutoParagraph
                        // injector did not add any start paragraph tokens.
                        // This means that we have been in a paragraph for
                        // a while, and the newline means we should start a new one.
                        $result[] = new HTMLPurifier_Token_End('p');
                        $result[] = new HTMLPurifier_Token_Text("\n\n");
                        // However, the start token should only be added if
                        // there is more processing to be done (i.e. there are
                        // real paragraphs in here). If there are none, the
                        // next start paragraph tag will be handled by the
                        // next call to the injector
                        $needs_start = true;
                    } else {
                        // We just started a new paragraph!
                        // Reinstate a double-newline for presentation's sake, since
                        // it was in the source code.
                        array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                    }
                } elseif ($i + 1 === $c) {
                    // Double newline at the end
                    // There should be a trailing </p> when we're finally done.
                    $needs_end = true;
                }
            }
        }

        // Check if this was just a giant blob of whitespace. This has to
        // come after the loop, because $paragraphs is populated by it.
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags), i.e. its name is listed among the child elements of
     * the <p> definition.
     * @param HTMLPurifier_Token $token
     * @return bool
     */
    private function _isInline($token)
    {
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag.
     * @return bool True if a later text token contains a double newline
     *          before any block-level start tag is met; false otherwise,
     *          including when the scan runs out of tokens undecided.
     */
    private function _pLookAhead()
    {
        // Starting nesting is 1 when the current token is itself a start
        // tag, 0 otherwise; it is handed to forwardUntilEndToken by reference.
        $nesting = ($this->currentToken instanceof HTMLPurifier_Token_Start) ? 1 : 0;
        $i = null;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                // First decisive token settles the question.
                return $result;
            }
        }
        return false;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with forwardUntilEndToken
     * @param HTMLPurifier_Token $current
     * @return bool|null True if $current is text containing a double
     *          newline, false if it is a non-inline start tag, null when
     *          the token does not settle the question.
     */
    private function _checkNeedsP($current)
    {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                //      ----
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                //      ----
                return true;
            } else {
                // <div>PAR1<b>PAR1...
                //      ----
            }
        }
        return null;
    }
}

// vim: et sw=4 sts=4
title
Description
Body
title
Description
Body
title
Description
Body
title
Body