Differences Between: [Versions 310 and 402] [Versions 39 and 402]
<?php

/*
 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
 *
 * This script is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * The GNU General Public License can be found at
 * http://www.gnu.org/copyleft/gpl.html.
 *
 * This script is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

namespace Html2Text;

/**
 * Converts an HTML fragment or document into formatted plain text.
 *
 * The conversion is regex driven: block level tags become newlines and
 * indentation, emphasis becomes underscores or upper case, links are rendered
 * according to the 'do_links' option and the result is optionally word-wrapped
 * to the 'width' option.
 */
class Html2Text
{
    const ENCODING = 'UTF-8';

    /**
     * Flags passed to html_entity_decode() / htmlspecialchars().
     *
     * @var int $htmlFuncFlags
     */
    protected $htmlFuncFlags;

    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     */
    protected $text;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * @var array $search
     * @see $replace
     */
    protected $search = array(
        "/\r/",                                           // Non-legal carriage return
        "/[\n\t]+/",                                      // Newlines and tabs
        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
        '/<ins\b[^>]*>(.*?)<\/ins>/i',                    // <ins>
        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
        '/<li\b[^>]*>/i',                                 // <li>
        '/<hr\b[^>]*>/i',                                 // <hr>
        '/<div\b[^>]*>/i',                                // <div>
        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $replace
     * @see $search
     */
    protected $replace = array(
        '',                              // Non-legal carriage return
        ' ',                             // Newlines and tabs
        '',                              // <head>
        '',                              // <script>s -- which strip_tags supposedly has problems with
        '',                              // <style>s -- which strip_tags supposedly has problems with
        '_\\1_',                         // <i>
        '_\\1_',                         // <em>
        '_\\1_',                         // <ins>
        "\n\n",                          // <ul> and </ul>
        "\n\n",                          // <ol> and </ol>
        "\n\n",                          // <dl> and </dl>
        "\t* \\1\n",                     // <li> and </li>
        " \\1\n",                        // <dd> and </dd>
        "\t* \\1",                       // <dt> and </dt>
        "\n\t* ",                        // <li>
        "\n-------------------------\n", // <hr>
        "<div>\n",                       // <div>
        "\n\n",                          // <table> and </table>
        "\n",                            // <tr> and </tr>
        "\t\t\\1\n",                     // <td> and </td>
        "",                              // <span class="_html2text_ignore">...</span>
        '[\\2]',                         // <img> with alt tag
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $entReplace.
     *
     * These run before html_entity_decode(), so they must match the entity
     * source text, not the decoded character.
     *
     * @var array $entSearch
     * @see $entReplace
     */
    protected $entSearch = array(
        '/&#153;/i',      // TM symbol in win-1252
        '/&#151;/i',      // m-dash in win-1252
        '/&(amp|#38);/i', // Ampersand: see converter()
        '/[ ]{2,}/',      // Runs of spaces, post-handling
        '/&#0*39;/',      // The apostrophe symbol (&#39; or &#039;); ENT_COMPAT leaves it undecoded
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $entReplace
     * @see $entSearch
     */
    protected $entReplace = array(
        '™',         // TM symbol
        '—',         // m-dash
        '|+|amp|+|', // Ampersand: see converter()
        ' ',         // Runs of spaces, post-handling
        '\'',        // Apostrophe
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * pregCallback() dispatches on.
     *
     * @var array $callbackSearch
     */
    protected $callbackSearch = array(
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
        '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
        '/<(del)( [^>]*)?>(.*?)<\/del>/i',                       // <del>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i', // <a href="">
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $preReplace.
     *
     * @var array $preSearch
     * @see $preReplace
     */
    protected $preSearch = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/',
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is turned into markup/entities so that the later
     * whitespace-collapsing passes in converter() do not destroy it.
     *
     * @var array $preReplace
     * @see $preSearch
     */
    protected $preReplace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    );

    /**
     * Temporary workspace used during PRE processing.
     *
     * @var string $preContent
     */
    protected $preContent = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $baseurl
     */
    protected $baseurl = '';

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $converted
     * @see $html, $text
     */
    protected $converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array $linkList
     * @see buildlinkList()
     */
    protected $linkList = array();

    /**
     * Various configuration options (able to be set in the constructor)
     *
     * @var array $options
     */
    protected $options = array(
        'do_links' => 'inline', // 'none'
                                // 'inline' (show links inline)
                                // 'nextline' (show links on the next line)
                                // 'table' (if a table of link URLs should be listed after the text)
                                // 'bbcode' (show links as bbcode)

        'width' => 70,          // Maximum width of the formatted text, in columns.
                                // Set this value to 0 (or less) to ignore word wrapping
                                // and not constrain text to a fixed-width column.
    );

    /**
     * Handles the old three-argument constructor form ($html, $fromFile, $options).
     *
     * @param string $html     Source HTML
     * @param bool   $fromFile Must be falsy; loading from a file is no longer supported
     * @param array  $options  Set configuration options
     * @throws \InvalidArgumentException When $fromFile is truthy (raised by set_html())
     */
    private function legacyConstruct($html = '', $fromFile = false, array $options = array())
    {
        $this->set_html($html, $fromFile);
        $this->options = array_merge($this->options, $options);
    }

    /**
     * @param string $html    Source HTML
     * @param array  $options Set configuration options
     */
    public function __construct($html = '', $options = array())
    {
        // Initialise the entity flags before anything else so that every
        // construction path, including the legacy one below, has them set.
        // ENT_HTML5 only exists from PHP 5.4 on.
        $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
            ? ENT_COMPAT
            : ENT_COMPAT | ENT_HTML5;

        // for backwards compatibility
        if (!is_array($options)) {
            call_user_func_array(array($this, 'legacyConstruct'), func_get_args());

            return;
        }

        $this->html = $html;
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the source HTML
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Set the source HTML
     *
     * Resets the converted flag so the next getText() call re-runs the conversion.
     *
     * @param string $html HTML source content
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->converted = false;
    }

    /**
     * @deprecated Use setHtml() instead.
     *
     * @param string $html      HTML source content
     * @param bool   $from_file Must be falsy
     * @throws \InvalidArgumentException When $from_file is truthy
     */
    public function set_html($html, $from_file = false)
    {
        if ($from_file) {
            throw new \InvalidArgumentException("Argument from_file no longer supported");
        }

        return $this->setHtml($html);
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion runs lazily on first call and is cached until setHtml()
     * is called again.
     *
     * @return string Plain text
     */
    public function getText()
    {
        if (!$this->converted) {
            $this->convert();
        }

        return $this->text;
    }

    /**
     * @deprecated Use getText() instead.
     */
    public function get_text()
    {
        return $this->getText();
    }

    /**
     * @deprecated Print the result of getText() instead.
     */
    public function print_text()
    {
        print $this->getText();
    }

    /**
     * @deprecated Print the result of getText() instead.
     */
    public function p()
    {
        return $this->print_text();
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * @param string $baseurl
     */
    public function setBaseUrl($baseurl)
    {
        $this->baseurl = $baseurl;
    }

    /**
     * @deprecated Use setBaseUrl() instead.
     */
    public function set_base_url($baseurl)
    {
        return $this->setBaseUrl($baseurl);
    }

    /**
     * Runs the conversion with the mbstring internal encoding temporarily
     * switched to self::ENCODING.
     *
     * The previous encoding is restored even when the conversion throws, so
     * a failure here does not leak a changed global setting to the caller.
     */
    protected function convert()
    {
        $origEncoding = mb_internal_encoding();
        mb_internal_encoding(self::ENCODING);

        // No "finally" here: the file keeps a PHP < 5.4 code path, and
        // "finally" needs PHP 5.5.
        try {
            $this->doConvert();
        } catch (\Exception $e) {
            mb_internal_encoding($origEncoding);
            throw $e;
        } catch (\Throwable $e) {
            // PHP 7+ engine errors; this clause never matches on PHP 5.
            mb_internal_encoding($origEncoding);
            throw $e;
        }

        mb_internal_encoding($origEncoding);
    }

    /**
     * Converts $this->html into $this->text and appends the link table, if any.
     */
    protected function doConvert()
    {
        $this->linkList = array();

        $text = trim($this->html);

        $this->converter($text);

        if ($this->linkList) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->linkList as $i => $url) {
                $text .= '[' . ($i + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->converted = true;
    }

    /**
     * Converts an HTML string to plain text in place.
     *
     * Also called recursively by convertBlockquotes() for quoted content.
     *
     * @param string &$text HTML on input, plain text on output
     */
    protected function converter(&$text)
    {
        $this->convertBlockquotes($text);
        $this->convertPre($text);
        $text = preg_replace($this->search, $this->replace, $text);
        $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
        $text = strip_tags($text);
        $text = preg_replace($this->entSearch, $this->entReplace, $text);
        $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Normalise empty lines
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        if ($this->options['width'] > 0) {
            // NOTE: wordwrap() counts bytes, so multibyte text may wrap
            // before the configured column.
            $text = wordwrap($text, $this->options['width']);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param string      $link         URL of the link
     * @param string      $display      Part of the text to associate number with
     * @param string|null $linkOverride Link method to use instead of the 'do_links' option
     * @return string
     */
    protected function buildlinkList($link, $display, $linkOverride = null)
    {
        $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
        if ($linkMethod == 'none') {
            return $display;
        }

        // Ignored link types. Decode with the same flags/encoding as the rest
        // of the class so entity-obfuscated prefixes are recognised consistently.
        $decodedLink = html_entity_decode($link, $this->htmlFuncFlags, self::ENCODING);
        if (preg_match('!^(javascript:|mailto:|#)!i', $decodedLink)) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Link already carries a scheme: use it as is.
            $url = $link;
        } else {
            // Relative link: resolve against the configured base URL.
            $url = $this->baseurl;
            if (mb_substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        if ($linkMethod == 'table') {
            // Strict search: URLs are strings and must not be compared numerically.
            if (($index = array_search($url, $this->linkList, true)) === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';
        } elseif ($linkMethod == 'nextline') {
            if ($url === $display) {
                return $display;
            }

            return $display . "\n[" . $url . ']';
        } elseif ($linkMethod == 'bbcode') {
            return sprintf('[url=%s]%s[/url]', $url, $display);
        } else { // link_method defaults to inline
            if ($url === $display) {
                return $display;
            }

            return $display . ' [' . $url . ']';
        }
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Each PRE element is replaced by a DIV whose whitespace has been turned
     * into BR tags and non-breaking-space entities.
     *
     * @param string &$text HTML content
     */
    protected function convertPre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
            $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

            // Run our defined tags search-and-replace with callback
            $this->preContent = preg_replace_callback(
                $this->callbackSearch,
                array($this, 'pregCallback'),
                $this->preContent
            );

            // convert the content
            $this->preContent = sprintf(
                '<div><br>%s<br></div>',
                preg_replace($this->preSearch, $this->preReplace, $this->preContent)
            );

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback(
                '/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pregPreCallback'),
                $text,
                1
            );

            // free memory
            $this->preContent = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Every outermost blockquote is converted on its own (with the width
     * reduced by two columns), prefixed with "> " citation markers and
     * re-inserted as a PRE block.
     *
     * @param string &$text HTML content
     */
    protected function convertBlockquotes(&$text)
    {
        if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            $originalText = $text;
            $start = 0;
            $taglen = 0;
            $level = 0;
            // Number of characters $text has shrunk by, relative to
            // $originalText, because of the replacements made so far.
            $diff = 0;
            foreach ($matches[0] as $m) {
                // PREG_OFFSET_CAPTURE reports byte offsets; convert to a character offset.
                $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
                if ($m[0][0] == '<' && $m[0][1] == '/') {
                    $level--;
                    if ($level < 0) {
                        $level = 0; // malformed HTML: go to next blockquote
                    } elseif ($level > 0) {
                        // skip inner blockquote
                    } else {
                        $end = $m[1];
                        $len = $end - $taglen - $start;
                        // Get blockquote content
                        $body = mb_substr($text, $start + $taglen - $diff, $len);

                        // Set text width
                        $pWidth = $this->options['width'];
                        if ($this->options['width'] > 0) {
                            $this->options['width'] -= 2;
                        }
                        // Convert blockquote content
                        $body = trim($body);
                        $this->converter($body);
                        // Add citation markers and create PRE block
                        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                        $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
                        // Re-set text width
                        $this->options['width'] = $pWidth;
                        // Replace content
                        $text = mb_substr($text, 0, $start - $diff)
                            . $body
                            . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

                        $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
                        unset($body);
                    }
                } else {
                    if ($level == 0) {
                        $start = $m[1];
                        $taglen = mb_strlen($m[0]);
                    }
                    $level++;
                }
            }
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured in $matches[1]; see $callbackSearch
     * for the meaning of the remaining groups.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregCallback($matches)
    {
        switch (mb_strtolower($matches[1])) {
            case 'p':
                // Replace newlines with spaces.
                $para = str_replace("\n", " ", $matches[3]);

                // Trim trailing and leading whitespace within the tag.
                $para = trim($para);

                // Add trailing newlines for this para.
                return "\n" . $para . "\n";
            case 'br':
                return "\n";
            case 'b':
            case 'strong':
                return $this->toupper($matches[3]);
            case 'del':
                return $this->tostrike($matches[3]);
            case 'th':
                return $this->toupper("\t\t" . $matches[3] . "\n");
            case 'h':
                return $this->toupper("\n\n" . $matches[3] . "\n\n");
            case 'a':
                // override the link method
                $linkOverride = null;
                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
                    $linkOverride = $linkOverrideMatch[1];
                }
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);

                return $this->buildlinkList($url, $matches[5], $linkOverride);
        }

        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * Returns the prepared PRE replacement verbatim, so "$0"-style sequences
     * in the content are not interpreted as back-references.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
    {
        return $this->preContent;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $i => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$i] = $this->strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded first so that their names are not upper-cased,
     * then the special characters are re-encoded.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function strtoupper($str)
    {
        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

        return $str;
    }

    /**
     * Helper function for DEL conversion.
     *
     * Appends a combining strike-through mark after every character.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function tostrike($str)
    {
        // UTF-8 encoding (0xCC 0xB6) of U+0336 COMBINING LONG STROKE OVERLAY.
        $combiningChr = chr(0xC0 | (0x336 >> 6)) . chr(0x80 | (0x336 & 0x3F));

        $length = mb_strlen($str);
        $rtn = '';
        for ($i = 0; $i < $length; $i++) {
            $rtn .= mb_substr($str, $i, 1) . $combiningChr;
        }

        return $rtn;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body