Differences Between: [Versions 310 and 311] [Versions 310 and 400] [Versions 310 and 401] [Versions 310 and 402] [Versions 310 and 403]
<?php

/*
 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
 *
 * This script is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * The GNU General Public License can be found at
 * http://www.gnu.org/copyleft/gpl.html.
 *
 * This script is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

namespace Html2Text;

/**
 * Converts an HTML string into formatted plain text.
 *
 * The conversion is regex driven: block-level constructs (blockquotes, PRE
 * sections) are rewritten first, then tags are replaced by plain-text
 * equivalents, remaining tags are stripped and entities are decoded.
 */
class Html2Text
{
    const ENCODING = 'UTF-8';

    /**
     * Flags passed to html_entity_decode() and htmlspecialchars().
     *
     * @type int
     */
    protected $htmlFuncFlags;

    /**
     * Contains the HTML content to convert.
     *
     * @type string
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @type string
     */
    protected $text;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * @type array
     * @see $replace
     */
    protected $search = array(
        "/\r/",                                           // Non-legal carriage return
        "/[\n\t]+/",                                      // Newlines and tabs
        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
        '/<li\b[^>]*>/i',                                 // <li>
        '/<hr\b[^>]*>/i',                                 // <hr>
        '/<div\b[^>]*>/i',                                // <div>
        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @type array
     * @see $search
     */
    protected $replace = array(
        '',                              // Non-legal carriage return
        ' ',                             // Newlines and tabs
        '',                              // <head>
        '',                              // <script>s -- which strip_tags supposedly has problems with
        '',                              // <style>s -- which strip_tags supposedly has problems with
        '_\\1_',                         // <i>
        '_\\1_',                         // <em>
        "\n\n",                          // <ul> and </ul>
        "\n\n",                          // <ol> and </ol>
        "\n\n",                          // <dl> and </dl>
        "\t* \\1\n",                     // <li> and </li>
        " \\1\n",                        // <dd> and </dd>
        "\t* \\1",                       // <dt> and </dt>
        "\n\t* ",                        // <li>
        "\n-------------------------\n", // <hr>
        "<div>\n",                       // <div>
        "\n\n",                          // <table> and </table>
        "\n",                            // <tr> and </tr>
        "\t\t\\1\n",                     // <td> and </td>
        "",                              // <span class="_html2text_ignore">...</span>
        '[\\2]',                         // <img> with alt tag
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $entReplace.
     *
     * The patterns match the entity-encoded forms; they run before
     * html_entity_decode() so these cases get an explicit replacement.
     *
     * @type array
     * @see $entReplace
     */
    protected $entSearch = array(
        '/&#153;/i',      // TM symbol in win-1252
        '/&#151;/i',      // m-dash in win-1252
        '/&(amp|#38);/i', // Ampersand: see converter()
        '/[ ]{2,}/',      // Runs of spaces, post-handling
        '/&#39;/i',       // The apostrophe symbol
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @type array
     * @see $entSearch
     */
    protected $entReplace = array(
        '™',         // TM symbol
        '—',         // m-dash
        '|+|amp|+|', // Ampersand: see converter()
        ' ',         // Runs of spaces, post-handling
        '\'',        // Apostrophe
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * pregCallback() dispatches on.
     *
     * @type array
     * @see pregCallback()
     */
    protected $callbackSearch = array(
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',          // h1 - h6
        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                 // <p> with surrounding whitespace.
        '/<(br)[^>]*>[ ]*/i',                                   // <br> with leading whitespace after the newline.
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                          // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                // <strong>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                        // <th> and </th>
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i' // <a href="">
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $preReplace.
     *
     * @type array
     * @see $preReplace
     */
    protected $preSearch = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is turned into non-breaking-space entities so that the later
     * "runs of spaces" clean-up in $entSearch does not collapse it.
     *
     * @type array
     * @see $preSearch
     */
    protected $preReplace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    );

    /**
     * Temporary workspace used during PRE processing.
     *
     * @type string
     */
    protected $preContent = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @type string
     */
    protected $baseurl = '';

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @type boolean
     * @see $html, $text
     */
    protected $converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @type array
     * @see buildlinkList()
     */
    protected $linkList = array();

    /**
     * Various configuration options (able to be set in the constructor)
     *
     * @type array
     */
    protected $options = array(
        'do_links' => 'inline', // 'none'
                                // 'inline' (show links inline)
                                // 'nextline' (show links on the next line)
                                // 'table' (if a table of link URLs should be listed after the text.
                                // 'bbcode' (show links as bbcode)

        'width' => 70,          // Maximum width of the formatted text, in columns.
                                // Set this value to 0 (or less) to ignore word wrapping
                                // and not constrain text to a fixed-width column.
    );

    /**
     * Handles the legacy constructor signature ($html, $fromFile, $options).
     *
     * @param string  $html     Source HTML
     * @param boolean $fromFile Legacy flag; a truthy value makes set_html() throw
     * @param array   $options  Set configuration options
     * @throws \InvalidArgumentException When $fromFile is truthy
     */
    private function legacyConstruct($html = '', $fromFile = false, array $options = array())
    {
        $this->set_html($html, $fromFile);
        $this->options = array_merge($this->options, $options);
    }

    /**
     * @param string $html    Source HTML
     * @param array  $options Set configuration options
     */
    public function __construct($html = '', $options = array())
    {
        // Initialise the entity flags before branching: the legacy path below
        // returns early, and without this every later html_entity_decode() /
        // htmlspecialchars() call would receive null flags.
        $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
            ? ENT_COMPAT
            : ENT_COMPAT | ENT_HTML5;

        // for backwards compatibility
        if (!is_array($options)) {
            call_user_func_array(array($this, 'legacyConstruct'), func_get_args());

            return;
        }

        $this->html = $html;
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the source HTML
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Set the source HTML
     *
     * Marks the instance as not converted, so the next getText() call
     * converts the new content.
     *
     * @param string $html HTML source content
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->converted = false;
    }

    /**
     * @deprecated
     */
    public function set_html($html, $from_file = false)
    {
        if ($from_file) {
            throw new \InvalidArgumentException("Argument from_file no longer supported");
        }

        return $this->setHtml($html);
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion runs on first access and again after setHtml().
     *
     * @return string
     */
    public function getText()
    {
        if (!$this->converted) {
            $this->convert();
        }

        return $this->text;
    }

    /**
     * @deprecated
     */
    public function get_text()
    {
        return $this->getText();
    }

    /**
     * @deprecated
     */
    public function print_text()
    {
        print $this->getText();
    }

    /**
     * @deprecated
     */
    public function p()
    {
        return $this->print_text();
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * @param string $baseurl
     */
    public function setBaseUrl($baseurl)
    {
        $this->baseurl = $baseurl;
    }

    /**
     * @deprecated
     */
    public function set_base_url($baseurl)
    {
        return $this->setBaseUrl($baseurl);
    }

    /**
     * Runs the conversion with the mbstring internal encoding switched to
     * self::ENCODING, restoring the previous encoding afterwards.
     */
    protected function convert()
    {
        $origEncoding = mb_internal_encoding();
        mb_internal_encoding(self::ENCODING);

        $this->doConvert();

        mb_internal_encoding($origEncoding);
    }

    /**
     * Converts $html into $text and, when links were collected by
     * buildlinkList(), appends a numbered "Links:" section.
     */
    protected function doConvert()
    {
        $this->linkList = array();

        $text = trim($this->html);

        $this->converter($text);

        if ($this->linkList) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->linkList as $i => $url) {
                $text .= '[' . ($i + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->converted = true;
    }

    /**
     * Converts an HTML fragment to plain text in place.
     *
     * Also called by convertBlockquotes() on each blockquote body.
     *
     * @param string $text HTML content, replaced by its plain-text form
     */
    protected function converter(&$text)
    {
        $this->convertBlockquotes($text);
        $this->convertPre($text);
        $text = preg_replace($this->search, $this->replace, $text);
        $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
        $text = strip_tags($text);
        $text = preg_replace($this->entSearch, $this->entReplace, $text);
        $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Normalise empty lines
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        if ($this->options['width'] > 0) {
            $text = wordwrap($text, $this->options['width']);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param  string $link         URL of the link
     * @param  string $display      Part of the text to associate number with
     * @param  null   $linkOverride Link method overriding the 'do_links' option
     * @return string
     */
    protected function buildlinkList($link, $display, $linkOverride = null)
    {
        $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
        if ($linkMethod == 'none') {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Link carries its own scheme: use it unchanged.
            $url = $link;
        } else {
            // Relative link: resolve against the configured base URL.
            $url = $this->baseurl;
            if (mb_substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        if ($linkMethod == 'table') {
            // Strict search: loose comparison would treat numeric-looking
            // strings such as "1e3" and "1000" as the same entry.
            if (($index = array_search($url, $this->linkList, true)) === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';
        } elseif ($linkMethod == 'nextline') {
            if ($url === $display) {
                return $display;
            }
            return $display . "\n[" . $url . ']';
        } elseif ($linkMethod == 'bbcode') {
            return sprintf('[url=%s]%s[/url]', $url, $display);
        } else { // link_method defaults to inline
            if ($url === $display) {
                return $display;
            }
            return $display . ' [' . $url . ']';
        }
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Replaces each PRE element, one at a time, with a DIV whose content has
     * newlines and whitespace encoded per $preSearch / $preReplace.
     *
     * @param string $text HTML content
     */
    protected function convertPre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
            $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

            // Run our defined tags search-and-replace with callback
            $this->preContent = preg_replace_callback(
                $this->callbackSearch,
                array($this, 'pregCallback'),
                $this->preContent
            );

            // convert the content
            $this->preContent = sprintf(
                '<div><br>%s<br></div>',
                preg_replace($this->preSearch, $this->preReplace, $this->preContent)
            );

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback(
                '/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pregPreCallback'),
                $text,
                1
            );

            // free memory
            $this->preContent = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Each outermost blockquote is converted with converter() at a width
     * reduced by two columns, prefixed with "> " citation markers and
     * wrapped in a PRE element.
     *
     * @param string $text HTML content
     */
    protected function convertBlockquotes(&$text)
    {
        if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            $originalText = $text;
            $start = 0;
            $taglen = 0;
            $level = 0;
            // Accumulated length difference between $originalText and the rewritten $text
            $diff = 0;
            foreach ($matches[0] as $m) {
                // PREG_OFFSET_CAPTURE reports byte offsets; convert to a character offset
                $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
                if ($m[0][0] == '<' && $m[0][1] == '/') {
                    $level--;
                    if ($level < 0) {
                        $level = 0; // malformed HTML: go to next blockquote
                    } elseif ($level > 0) {
                        // skip inner blockquote
                    } else {
                        $end = $m[1];
                        $len = $end - $taglen - $start;
                        // Get blockquote content
                        $body = mb_substr($text, $start + $taglen - $diff, $len);

                        // Set text width
                        $pWidth = $this->options['width'];
                        if ($this->options['width'] > 0) {
                            $this->options['width'] -= 2;
                        }
                        // Convert blockquote content
                        $body = trim($body);
                        $this->converter($body);
                        // Add citation markers and create PRE block
                        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                        $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
                        // Re-set text width
                        $this->options['width'] = $pWidth;
                        // Replace content
                        $text = mb_substr($text, 0, $start - $diff)
                            . $body
                            . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

                        $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
                        unset($body);
                    }
                } else {
                    if ($level == 0) {
                        $start = $m[1];
                        $taglen = mb_strlen($m[0]);
                    }
                    $level++;
                }
            }
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured in $matches[1] by $callbackSearch.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregCallback($matches)
    {
        switch (mb_strtolower($matches[1])) {
            case 'p':
                // Replace newlines with spaces.
                $para = str_replace("\n", " ", $matches[3]);

                // Trim trailing and leading whitespace within the tag.
                $para = trim($para);

                // Add trailing newlines for this para.
                return "\n" . $para . "\n";
            case 'br':
                return "\n";
            case 'b':
            case 'strong':
                return $this->toupper($matches[3]);
            case 'th':
                return $this->toupper("\t\t" . $matches[3] . "\n");
            case 'h':
                return $this->toupper("\n\n" . $matches[3] . "\n\n");
            case 'a':
                // override the link method
                $linkOverride = null;
                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
                    $linkOverride = $linkOverrideMatch[1];
                }
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);

                return $this->buildlinkList($url, $matches[5], $linkOverride);
        }

        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * Ignores the match and returns the content prepared by convertPre().
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
    {
        return $this->preContent;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $i => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$i] = $this->strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded before upper-casing and special characters are
     * re-encoded afterwards, so entity names themselves are not upper-cased.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function strtoupper($str)
    {
        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

        return $str;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body