Differences Between: [Versions 310 and 402] [Versions 311 and 402] [Versions 39 and 402] [Versions 400 and 402]
1 <?php 2 /** 3 * Markdown - A text-to-HTML conversion tool for web writers 4 * 5 * @package php-markdown 6 * @author Michel Fortin <michel.fortin@michelf.com> 7 * @copyright 2004-2022 Michel Fortin <https://michelf.com/projects/php-markdown/> 8 * @copyright (Original Markdown) 2004-2006 John Gruber <https://daringfireball.net/projects/markdown/> 9 */ 10 11 namespace Michelf; 12 13 /** 14 * Markdown Parser Class 15 */ 16 class Markdown implements MarkdownInterface { 17 /** 18 * Define the package version 19 * @var string 20 */ 21 const MARKDOWNLIB_VERSION = "2.0.0"; 22 23 /** 24 * Simple function interface - Initialize the parser and return the result 25 * of its transform method. This will work fine for derived classes too. 26 * 27 * @api 28 * 29 * @param string $text 30 * @return string 31 */ 32 public static function defaultTransform(string $text): string { 33 // Take parser class on which this function was called. 34 $parser_class = static::class; 35 36 // Try to take parser from the static parser list 37 static $parser_list; 38 $parser =& $parser_list[$parser_class]; 39 40 // Create the parser it not already set 41 if (!$parser) { 42 $parser = new $parser_class; 43 } 44 45 // Transform text using parser. 46 return $parser->transform($text); 47 } 48 49 /** 50 * Configuration variables 51 */ 52 /** 53 * Change to ">" for HTML output. 54 */ 55 public string $empty_element_suffix = " />"; 56 57 /** 58 * The width of indentation of the output markup 59 */ 60 public int $tab_width = 4; 61 62 /** 63 * Change to `true` to disallow markup or entities. 64 */ 65 public bool $no_markup = false; 66 public bool $no_entities = false; 67 68 69 /** 70 * Change to `true` to enable line breaks on \n without two trailling spaces 71 * @var boolean 72 */ 73 public bool $hard_wrap = false; 74 75 /** 76 * Predefined URLs and titles for reference links and images. 77 */ 78 public array $predef_urls = array(); 79 public array $predef_titles = array(); 80 81 /** 82 * Optional filter function for URLs 83 * @var callable|null 84 */ 85 public $url_filter_func = null; 86 87 /** 88 * Optional header id="" generation callback function. 89 * @var callable|null 90 */ 91 public $header_id_func = null; 92 93 /** 94 * Optional function for converting code block content to HTML 95 * @var callable|null 96 */ 97 public $code_block_content_func = null; 98 99 /** 100 * Optional function for converting code span content to HTML. 101 * @var callable|null 102 */ 103 public $code_span_content_func = null; 104 105 /** 106 * Class attribute to toggle "enhanced ordered list" behaviour 107 * setting this to true will allow ordered lists to start from the index 108 * number that is defined first. 109 * 110 * For example: 111 * 2. List item two 112 * 3. List item three 113 * 114 * Becomes: 115 * <ol start="2"> 116 * <li>List item two</li> 117 * <li>List item three</li> 118 * </ol> 119 */ 120 public bool $enhanced_ordered_list = false; 121 122 /** 123 * Parser implementation 124 */ 125 /** 126 * Regex to match balanced [brackets]. 127 * Needed to insert a maximum bracked depth while converting to PHP. 128 */ 129 protected int $nested_brackets_depth = 6; 130 protected string $nested_brackets_re; 131 132 protected int $nested_url_parenthesis_depth = 4; 133 protected string $nested_url_parenthesis_re; 134 135 /** 136 * Table of hash values for escaped characters: 137 */ 138 protected string $escape_chars = '\`*_{}[]()>#+-.!'; 139 protected string $escape_chars_re; 140 141 /** 142 * Constructor function. Initialize appropriate member variables. 143 * @return void 144 */ 145 public function __construct() { 146 $this->_initDetab(); 147 $this->prepareItalicsAndBold(); 148 149 $this->nested_brackets_re = 150 str_repeat('(?>[^\[\]]+|\[', $this->nested_brackets_depth). 151 str_repeat('\])*', $this->nested_brackets_depth); 152 153 $this->nested_url_parenthesis_re = 154 str_repeat('(?>[^()\s]+|\(', $this->nested_url_parenthesis_depth). 155 str_repeat('(?>\)))*', $this->nested_url_parenthesis_depth); 156 157 $this->escape_chars_re = '['.preg_quote($this->escape_chars).']'; 158 159 // Sort document, block, and span gamut in ascendent priority order. 160 asort($this->document_gamut); 161 asort($this->block_gamut); 162 asort($this->span_gamut); 163 } 164 165 166 /** 167 * Internal hashes used during transformation. 168 */ 169 protected array $urls = array(); 170 protected array $titles = array(); 171 protected array $html_hashes = array(); 172 173 /** 174 * Status flag to avoid invalid nesting. 175 */ 176 protected bool $in_anchor = false; 177 178 /** 179 * Status flag to avoid invalid nesting. 180 */ 181 protected bool $in_emphasis_processing = false; 182 183 /** 184 * Called before the transformation process starts to setup parser states. 185 * @return void 186 */ 187 protected function setup() { 188 // Clear global hashes. 189 $this->urls = $this->predef_urls; 190 $this->titles = $this->predef_titles; 191 $this->html_hashes = array(); 192 $this->in_anchor = false; 193 $this->in_emphasis_processing = false; 194 } 195 196 /** 197 * Called after the transformation process to clear any variable which may 198 * be taking up memory unnecessarly. 199 * @return void 200 */ 201 protected function teardown() { 202 $this->urls = array(); 203 $this->titles = array(); 204 $this->html_hashes = array(); 205 } 206 207 /** 208 * Main function. Performs some preprocessing on the input text and pass 209 * it through the document gamut. 210 * 211 * @api 212 * 213 * @param string $text 214 * @return string 215 */ 216 public function transform(string $text): string { 217 $this->setup(); 218 219 # Remove UTF-8 BOM and marker character in input, if present. 220 $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text); 221 222 # Standardize line endings: 223 # DOS to Unix and Mac to Unix 224 $text = preg_replace('{\r\n?}', "\n", $text); 225 226 # Make sure $text ends with a couple of newlines: 227 $text .= "\n\n"; 228 229 # Convert all tabs to spaces. 230 $text = $this->detab($text); 231 232 # Turn block-level HTML blocks into hash entries 233 $text = $this->hashHTMLBlocks($text); 234 235 # Strip any lines consisting only of spaces and tabs. 236 # This makes subsequent regexen easier to write, because we can 237 # match consecutive blank lines with /\n+/ instead of something 238 # contorted like /[ ]*\n+/ . 239 $text = preg_replace('/^[ ]+$/m', '', $text); 240 241 # Run document gamut methods. 242 foreach ($this->document_gamut as $method => $priority) { 243 $text = $this->$method($text); 244 } 245 246 $this->teardown(); 247 248 return $text . "\n"; 249 } 250 251 /** 252 * Define the document gamut 253 */ 254 protected array $document_gamut = array( 255 // Strip link definitions, store in hashes. 256 "stripLinkDefinitions" => 20, 257 "runBasicBlockGamut" => 30, 258 ); 259 260 /** 261 * Strips link definitions from text, stores the URLs and titles in 262 * hash references 263 * @param string $text 264 * @return string 265 */ 266 protected function stripLinkDefinitions($text) { 267 268 $less_than_tab = $this->tab_width - 1; 269 270 // Link defs are in the form: ^[id]: url "optional title" 271 $text = preg_replace_callback('{ 272 ^[ ]{0,'.$less_than_tab.'}\[(.+)\][ ]?: # id = $1 273 [ ]* 274 \n? # maybe *one* newline 275 [ ]* 276 (?: 277 <(.+?)> # url = $2 278 | 279 (\S+?) # url = $3 280 ) 281 [ ]* 282 \n? # maybe one newline 283 [ ]* 284 (?: 285 (?<=\s) # lookbehind for whitespace 286 ["(] 287 (.*?) # title = $4 288 [")] 289 [ ]* 290 )? # title is optional 291 (?:\n+|\Z) 292 }xm', 293 array($this, '_stripLinkDefinitions_callback'), 294 $text 295 ); 296 return $text; 297 } 298 299 /** 300 * The callback to strip link definitions 301 * @param array $matches 302 * @return string 303 */ 304 protected function _stripLinkDefinitions_callback($matches) { 305 $link_id = strtolower($matches[1]); 306 $url = $matches[2] == '' ? $matches[3] : $matches[2]; 307 $this->urls[$link_id] = $url; 308 $this->titles[$link_id] =& $matches[4]; 309 return ''; // String that will replace the block 310 } 311 312 /** 313 * Hashify HTML blocks 314 * @param string $text 315 * @return string 316 */ 317 protected function hashHTMLBlocks($text) { 318 if ($this->no_markup) { 319 return $text; 320 } 321 322 $less_than_tab = $this->tab_width - 1; 323 324 /** 325 * Hashify HTML blocks: 326 * 327 * We only want to do this for block-level HTML tags, such as headers, 328 * lists, and tables. That's because we still want to wrap <p>s around 329 * "paragraphs" that are wrapped in non-block-level tags, such as 330 * anchors, phrase emphasis, and spans. The list of tags we're looking 331 * for is hard-coded: 332 * 333 * * List "a" is made of tags which can be both inline or block-level. 334 * These will be treated block-level when the start tag is alone on 335 * its line, otherwise they're not matched here and will be taken as 336 * inline later. 337 * * List "b" is made of tags which are always block-level; 338 */ 339 $block_tags_a_re = 'ins|del'; 340 $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'. 341 'script|noscript|style|form|fieldset|iframe|math|svg|'. 342 'article|section|nav|aside|hgroup|header|footer|'. 343 'figure|details|summary'; 344 345 // Regular expression for the content of a block tag. 346 $nested_tags_level = 4; 347 $attr = ' 348 (?> # optional tag attributes 349 \s # starts with whitespace 350 (?> 351 [^>"/]+ # text outside quotes 352 | 353 /+(?!>) # slash not followed by ">" 354 | 355 "[^"]*" # text inside double quotes (tolerate ">") 356 | 357 \'[^\']*\' # text inside single quotes (tolerate ">") 358 )* 359 )? 360 '; 361 $content = 362 str_repeat(' 363 (?> 364 [^<]+ # content without tag 365 | 366 <\2 # nested opening tag 367 '.$attr.' # attributes 368 (?> 369 /> 370 | 371 >', $nested_tags_level). // end of opening tag 372 '.*?'. // last level nested tag content 373 str_repeat(' 374 </\2\s*> # closing nested tag 375 ) 376 | 377 <(?!/\2\s*> # other tags with a different name 378 ) 379 )*', 380 $nested_tags_level); 381 $content2 = str_replace('\2', '\3', $content); 382 383 /** 384 * First, look for nested blocks, e.g.: 385 * <div> 386 * <div> 387 * tags for inner block must be indented. 388 * </div> 389 * </div> 390 * 391 * The outermost tags must start at the left margin for this to match, 392 * and the inner nested divs must be indented. 393 * We need to do this before the next, more liberal match, because the 394 * next match will start at the first `<div>` and stop at the 395 * first `</div>`. 396 */ 397 $text = preg_replace_callback('{(?> 398 (?> 399 (?<=\n) # Starting on its own line 400 | # or 401 \A\n? # the at beginning of the doc 402 ) 403 ( # save in $1 404 405 # Match from `\n<tag>` to `</tag>\n`, handling nested tags 406 # in between. 407 408 [ ]{0,'.$less_than_tab.'} 409 <('.$block_tags_b_re.')# start tag = $2 410 '.$attr.'> # attributes followed by > and \n 411 '.$content.' # content, support nesting 412 </\2> # the matching end tag 413 [ ]* # trailing spaces/tabs 414 (?=\n+|\Z) # followed by a newline or end of document 415 416 | # Special version for tags of group a. 417 418 [ ]{0,'.$less_than_tab.'} 419 <('.$block_tags_a_re.')# start tag = $3 420 '.$attr.'>[ ]*\n # attributes followed by > 421 '.$content2.' # content, support nesting 422 </\3> # the matching end tag 423 [ ]* # trailing spaces/tabs 424 (?=\n+|\Z) # followed by a newline or end of document 425 426 | # Special case just for <hr />. It was easier to make a special 427 # case than to make the other regex more complicated. 428 429 [ ]{0,'.$less_than_tab.'} 430 <(hr) # start tag = $2 431 '.$attr.' # attributes 432 /?> # the matching end tag 433 [ ]* 434 (?=\n{2,}|\Z) # followed by a blank line or end of document 435 436 | # Special case for standalone HTML comments: 437 438 [ ]{0,'.$less_than_tab.'} 439 (?s: 440 <!-- .*? --> 441 ) 442 [ ]* 443 (?=\n{2,}|\Z) # followed by a blank line or end of document 444 445 | # PHP and ASP-style processor instructions (<? and <%) 446 447 [ ]{0,'.$less_than_tab.'} 448 (?s: 449 <([?%]) # $2 450 .*? 451 \2> 452 ) 453 [ ]* 454 (?=\n{2,}|\Z) # followed by a blank line or end of document 455 456 ) 457 )}Sxmi', 458 array($this, '_hashHTMLBlocks_callback'), 459 $text 460 ); 461 462 return $text; 463 } 464 465 /** 466 * The callback for hashing HTML blocks 467 * @param string $matches 468 * @return string 469 */ 470 protected function _hashHTMLBlocks_callback($matches) { 471 $text = $matches[1]; 472 $key = $this->hashBlock($text); 473 return "\n\n$key\n\n"; 474 } 475 476 /** 477 * Called whenever a tag must be hashed when a function insert an atomic 478 * element in the text stream. Passing $text to through this function gives 479 * a unique text-token which will be reverted back when calling unhash. 480 * 481 * The $boundary argument specify what character should be used to surround 482 * the token. By convension, "B" is used for block elements that needs not 483 * to be wrapped into paragraph tags at the end, ":" is used for elements 484 * that are word separators and "X" is used in the general case. 485 * 486 * @param string $text 487 * @param string $boundary 488 * @return string 489 */ 490 protected function hashPart($text, $boundary = 'X') { 491 // Swap back any tag hash found in $text so we do not have to `unhash` 492 // multiple times at the end. 493 $text = $this->unhash($text); 494 495 // Then hash the block. 496 static $i = 0; 497 $key = "$boundary\x1A" . ++$i . $boundary; 498 $this->html_hashes[$key] = $text; 499 return $key; // String that will replace the tag. 500 } 501 502 /** 503 * Shortcut function for hashPart with block-level boundaries. 504 * @param string $text 505 * @return string 506 */ 507 protected function hashBlock($text) { 508 return $this->hashPart($text, 'B'); 509 } 510 511 /** 512 * Define the block gamut - these are all the transformations that form 513 * block-level tags like paragraphs, headers, and list items. 514 */ 515 protected array $block_gamut = array( 516 "doHeaders" => 10, 517 "doHorizontalRules" => 20, 518 "doLists" => 40, 519 "doCodeBlocks" => 50, 520 "doBlockQuotes" => 60, 521 ); 522 523 /** 524 * Run block gamut tranformations. 525 * 526 * We need to escape raw HTML in Markdown source before doing anything 527 * else. This need to be done for each block, and not only at the 528 * begining in the Markdown function since hashed blocks can be part of 529 * list items and could have been indented. Indented blocks would have 530 * been seen as a code block in a previous pass of hashHTMLBlocks. 531 * 532 * @param string $text 533 * @return string 534 */ 535 protected function runBlockGamut($text) { 536 $text = $this->hashHTMLBlocks($text); 537 return $this->runBasicBlockGamut($text); 538 } 539 540 /** 541 * Run block gamut tranformations, without hashing HTML blocks. This is 542 * useful when HTML blocks are known to be already hashed, like in the first 543 * whole-document pass. 544 * 545 * @param string $text 546 * @return string 547 */ 548 protected function runBasicBlockGamut($text) { 549 550 foreach ($this->block_gamut as $method => $priority) { 551 $text = $this->$method($text); 552 } 553 554 // Finally form paragraph and restore hashed blocks. 555 $text = $this->formParagraphs($text); 556 557 return $text; 558 } 559 560 /** 561 * Convert horizontal rules 562 * @param string $text 563 * @return string 564 */ 565 protected function doHorizontalRules($text) { 566 return preg_replace( 567 '{ 568 ^[ ]{0,3} # Leading space 569 ([-*_]) # $1: First marker 570 (?> # Repeated marker group 571 [ ]{0,2} # Zero, one, or two spaces. 572 \1 # Marker character 573 ){2,} # Group repeated at least twice 574 [ ]* # Tailing spaces 575 $ # End of line. 576 }mx', 577 "\n".$this->hashBlock("<hr$this->empty_element_suffix")."\n", 578 $text 579 ); 580 } 581 582 /** 583 * These are all the transformations that occur *within* block-level 584 * tags like paragraphs, headers, and list items. 585 */ 586 protected array $span_gamut = array( 587 // Process character escapes, code spans, and inline HTML 588 // in one shot. 589 "parseSpan" => -30, 590 // Process anchor and image tags. Images must come first, 591 // because ![foo][f] looks like an anchor. 592 "doImages" => 10, 593 "doAnchors" => 20, 594 // Make links out of things like `<https://example.com/>` 595 // Must come after doAnchors, because you can use < and > 596 // delimiters in inline links like [this](<url>). 597 "doAutoLinks" => 30, 598 "encodeAmpsAndAngles" => 40, 599 "doItalicsAndBold" => 50, 600 "doHardBreaks" => 60, 601 ); 602 603 /** 604 * Run span gamut transformations 605 * @param string $text 606 * @return string 607 */ 608 protected function runSpanGamut($text) { 609 foreach ($this->span_gamut as $method => $priority) { 610 $text = $this->$method($text); 611 } 612 613 return $text; 614 } 615 616 /** 617 * Do hard breaks 618 * @param string $text 619 * @return string 620 */ 621 protected function doHardBreaks($text) { 622 if ($this->hard_wrap) { 623 return preg_replace_callback('/ *\n/', 624 array($this, '_doHardBreaks_callback'), $text); 625 } else { 626 return preg_replace_callback('/ {2,}\n/', 627 array($this, '_doHardBreaks_callback'), $text); 628 } 629 } 630 631 /** 632 * Trigger part hashing for the hard break (callback method) 633 * @param array $matches 634 * @return string 635 */ 636 protected function _doHardBreaks_callback($matches) { 637 return $this->hashPart("<br$this->empty_element_suffix\n"); 638 } 639 640 /** 641 * Turn Markdown link shortcuts into XHTML <a> tags. 642 * @param string $text 643 * @return string 644 */ 645 protected function doAnchors($text) { 646 if ($this->in_anchor) { 647 return $text; 648 } 649 $this->in_anchor = true; 650 651 // First, handle reference-style links: [link text] [id] 652 $text = preg_replace_callback('{ 653 ( # wrap whole match in $1 654 \[ 655 ('.$this->nested_brackets_re.') # link text = $2 656 \] 657 658 [ ]? # one optional space 659 (?:\n[ ]*)? # one optional newline followed by spaces 660 661 \[ 662 (.*?) # id = $3 663 \] 664 ) 665 }xs', 666 array($this, '_doAnchors_reference_callback'), $text); 667 668 // Next, inline-style links: [link text](url "optional title") 669 $text = preg_replace_callback('{ 670 ( # wrap whole match in $1 671 \[ 672 ('.$this->nested_brackets_re.') # link text = $2 673 \] 674 \( # literal paren 675 [ \n]* 676 (?: 677 <(.+?)> # href = $3 678 | 679 ('.$this->nested_url_parenthesis_re.') # href = $4 680 ) 681 [ \n]* 682 ( # $5 683 ([\'"]) # quote char = $6 684 (.*?) # Title = $7 685 \6 # matching quote 686 [ \n]* # ignore any spaces/tabs between closing quote and ) 687 )? # title is optional 688 \) 689 ) 690 }xs', 691 array($this, '_doAnchors_inline_callback'), $text); 692 693 // Last, handle reference-style shortcuts: [link text] 694 // These must come last in case you've also got [link text][1] 695 // or [link text](/foo) 696 $text = preg_replace_callback('{ 697 ( # wrap whole match in $1 698 \[ 699 ([^\[\]]+) # link text = $2; can\'t contain [ or ] 700 \] 701 ) 702 }xs', 703 array($this, '_doAnchors_reference_callback'), $text); 704 705 $this->in_anchor = false; 706 return $text; 707 } 708 709 /** 710 * Callback method to parse referenced anchors 711 * @param array $matches 712 * @return string 713 */ 714 protected function _doAnchors_reference_callback($matches) { 715 $whole_match = $matches[1]; 716 $link_text = $matches[2]; 717 $link_id =& $matches[3]; 718 719 if ($link_id == "") { 720 // for shortcut links like [this][] or [this]. 721 $link_id = $link_text; 722 } 723 724 // lower-case and turn embedded newlines into spaces 725 $link_id = strtolower($link_id); 726 $link_id = preg_replace('{[ ]?\n}', ' ', $link_id); 727 728 if (isset($this->urls[$link_id])) { 729 $url = $this->urls[$link_id]; 730 $url = $this->encodeURLAttribute($url); 731 732 $result = "<a href=\"$url\""; 733 if ( isset( $this->titles[$link_id] ) ) { 734 $title = $this->titles[$link_id]; 735 $title = $this->encodeAttribute($title); 736 $result .= " title=\"$title\""; 737 } 738 739 $link_text = $this->runSpanGamut($link_text); 740 $result .= ">$link_text</a>"; 741 $result = $this->hashPart($result); 742 } else { 743 $result = $whole_match; 744 } 745 return $result; 746 } 747 748 /** 749 * Callback method to parse inline anchors 750 * @param array $matches 751 * @return string 752 */ 753 protected function _doAnchors_inline_callback($matches) { 754 $link_text = $this->runSpanGamut($matches[2]); 755 $url = $matches[3] === '' ? $matches[4] : $matches[3]; 756 $title =& $matches[7]; 757 758 // If the URL was of the form <s p a c e s> it got caught by the HTML 759 // tag parser and hashed. Need to reverse the process before using 760 // the URL. 761 $unhashed = $this->unhash($url); 762 if ($unhashed !== $url) 763 $url = preg_replace('/^<(.*)>$/', '\1', $unhashed); 764 765 $url = $this->encodeURLAttribute($url); 766 767 $result = "<a href=\"$url\""; 768 if ($title) { 769 $title = $this->encodeAttribute($title); 770 $result .= " title=\"$title\""; 771 } 772 773 $link_text = $this->runSpanGamut($link_text); 774 $result .= ">$link_text</a>"; 775 776 return $this->hashPart($result); 777 } 778 779 /** 780 * Turn Markdown image shortcuts into <img> tags. 781 * @param string $text 782 * @return string 783 */ 784 protected function doImages($text) { 785 // First, handle reference-style labeled images: ![alt text][id] 786 $text = preg_replace_callback('{ 787 ( # wrap whole match in $1 788 !\[ 789 ('.$this->nested_brackets_re.') # alt text = $2 790 \] 791 792 [ ]? # one optional space 793 (?:\n[ ]*)? # one optional newline followed by spaces 794 795 \[ 796 (.*?) # id = $3 797 \] 798 799 ) 800 }xs', 801 array($this, '_doImages_reference_callback'), $text); 802 803 // Next, handle inline images: ![alt text](url "optional title") 804 // Don't forget: encode * and _ 805 $text = preg_replace_callback('{ 806 ( # wrap whole match in $1 807 !\[ 808 ('.$this->nested_brackets_re.') # alt text = $2 809 \] 810 \s? # One optional whitespace character 811 \( # literal paren 812 [ \n]* 813 (?: 814 <(\S*)> # src url = $3 815 | 816 ('.$this->nested_url_parenthesis_re.') # src url = $4 817 ) 818 [ \n]* 819 ( # $5 820 ([\'"]) # quote char = $6 821 (.*?) # title = $7 822 \6 # matching quote 823 [ \n]* 824 )? # title is optional 825 \) 826 ) 827 }xs', 828 array($this, '_doImages_inline_callback'), $text); 829 830 return $text; 831 } 832 833 /** 834 * Callback to parse references image tags 835 * @param array $matches 836 * @return string 837 */ 838 protected function _doImages_reference_callback($matches) { 839 $whole_match = $matches[1]; 840 $alt_text = $matches[2]; 841 $link_id = strtolower($matches[3]); 842 843 if ($link_id == "") { 844 $link_id = strtolower($alt_text); // for shortcut links like ![this][]. 845 } 846 847 $alt_text = $this->encodeAttribute($alt_text); 848 if (isset($this->urls[$link_id])) { 849 $url = $this->encodeURLAttribute($this->urls[$link_id]); 850 $result = "<img src=\"$url\" alt=\"$alt_text\""; 851 if (isset($this->titles[$link_id])) { 852 $title = $this->titles[$link_id]; 853 $title = $this->encodeAttribute($title); 854 $result .= " title=\"$title\""; 855 } 856 $result .= $this->empty_element_suffix; 857 $result = $this->hashPart($result); 858 } else { 859 // If there's no such link ID, leave intact: 860 $result = $whole_match; 861 } 862 863 return $result; 864 } 865 866 /** 867 * Callback to parse inline image tags 868 * @param array $matches 869 * @return string 870 */ 871 protected function _doImages_inline_callback($matches) { 872 $whole_match = $matches[1]; 873 $alt_text = $matches[2]; 874 $url = $matches[3] == '' ? $matches[4] : $matches[3]; 875 $title =& $matches[7]; 876 877 $alt_text = $this->encodeAttribute($alt_text); 878 $url = $this->encodeURLAttribute($url); 879 $result = "<img src=\"$url\" alt=\"$alt_text\""; 880 if (isset($title)) { 881 $title = $this->encodeAttribute($title); 882 $result .= " title=\"$title\""; // $title already quoted 883 } 884 $result .= $this->empty_element_suffix; 885 886 return $this->hashPart($result); 887 } 888 889 /** 890 * Parse Markdown heading elements to HTML 891 * @param string $text 892 * @return string 893 */ 894 protected function doHeaders($text) { 895 /** 896 * Setext-style headers: 897 * Header 1 898 * ======== 899 * 900 * Header 2 901 * -------- 902 */ 903 $text = preg_replace_callback('{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx', 904 array($this, '_doHeaders_callback_setext'), $text); 905 906 /** 907 * atx-style headers: 908 * # Header 1 909 * ## Header 2 910 * ## Header 2 with closing hashes ## 911 * ... 912 * ###### Header 6 913 */ 914 $text = preg_replace_callback('{ 915 ^(\#{1,6}) # $1 = string of #\'s 916 [ ]* 917 (.+?) # $2 = Header text 918 [ ]* 919 \#* # optional closing #\'s (not counted) 920 \n+ 921 }xm', 922 array($this, '_doHeaders_callback_atx'), $text); 923 924 return $text; 925 } 926 927 /** 928 * Setext header parsing callback 929 * @param array $matches 930 * @return string 931 */ 932 protected function _doHeaders_callback_setext($matches) { 933 // Terrible hack to check we haven't found an empty list item. 934 if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1])) { 935 return $matches[0]; 936 } 937 938 $level = $matches[2][0] == '=' ? 1 : 2; 939 940 // ID attribute generation 941 $idAtt = $this->_generateIdFromHeaderValue($matches[1]); 942 943 $block = "<h$level$idAtt>".$this->runSpanGamut($matches[1])."</h$level>"; 944 return "\n" . $this->hashBlock($block) . "\n\n"; 945 } 946 947 /** 948 * ATX header parsing callback 949 * @param array $matches 950 * @return string 951 */ 952 protected function _doHeaders_callback_atx($matches) { 953 // ID attribute generation 954 $idAtt = $this->_generateIdFromHeaderValue($matches[2]); 955 956 $level = strlen($matches[1]); 957 $block = "<h$level$idAtt>".$this->runSpanGamut($matches[2])."</h$level>"; 958 return "\n" . $this->hashBlock($block) . "\n\n"; 959 } 960 961 /** 962 * If a header_id_func property is set, we can use it to automatically 963 * generate an id attribute. 964 * 965 * This method returns a string in the form id="foo", or an empty string 966 * otherwise. 967 * @param string $headerValue 968 * @return string 969 */ 970 protected function _generateIdFromHeaderValue($headerValue) { 971 if (!is_callable($this->header_id_func)) { 972 return ""; 973 } 974 975 $idValue = call_user_func($this->header_id_func, $headerValue); 976 if (!$idValue) { 977 return ""; 978 } 979 980 return ' id="' . $this->encodeAttribute($idValue) . '"'; 981 } 982 983 /** 984 * Form HTML ordered (numbered) and unordered (bulleted) lists. 985 * @param string $text 986 * @return string 987 */ 988 protected function doLists($text) { 989 $less_than_tab = $this->tab_width - 1; 990 991 // Re-usable patterns to match list item bullets and number markers: 992 $marker_ul_re = '[*+-]'; 993 $marker_ol_re = '\d+[\.]'; 994 995 $markers_relist = array( 996 $marker_ul_re => $marker_ol_re, 997 $marker_ol_re => $marker_ul_re, 998 ); 999 1000 foreach ($markers_relist as $marker_re => $other_marker_re) { 1001 // Re-usable pattern to match any entirel ul or ol list: 1002 $whole_list_re = ' 1003 ( # $1 = whole list 1004 ( # $2 1005 ([ ]{0,'.$less_than_tab.'}) # $3 = number of spaces 1006 ('.$marker_re.') # $4 = first list item marker 1007 [ ]+ 1008 ) 1009 (?s:.+?) 1010 ( # $5 1011 \z 1012 | 1013 \n{2,} 1014 (?=\S) 1015 (?! # Negative lookahead for another list item marker 1016 [ ]* 1017 '.$marker_re.'[ ]+ 1018 ) 1019 | 1020 (?= # Lookahead for another kind of list 1021 \n 1022 \3 # Must have the same indentation 1023 '.$other_marker_re.'[ ]+ 1024 ) 1025 ) 1026 ) 1027 '; // mx 1028 1029 // We use a different prefix before nested lists than top-level lists. 1030 //See extended comment in _ProcessListItems(). 1031 1032 if ($this->list_level) { 1033 $text = preg_replace_callback('{ 1034 ^ 1035 '.$whole_list_re.' 1036 }mx', 1037 array($this, '_doLists_callback'), $text); 1038 } else { 1039 $text = preg_replace_callback('{ 1040 (?:(?<=\n)\n|\A\n?) # Must eat the newline 1041 '.$whole_list_re.' 1042 }mx', 1043 array($this, '_doLists_callback'), $text); 1044 } 1045 } 1046 1047 return $text; 1048 } 1049 1050 /** 1051 * List parsing callback 1052 * @param array $matches 1053 * @return string 1054 */ 1055 protected function _doLists_callback($matches) { 1056 // Re-usable patterns to match list item bullets and number markers: 1057 $marker_ul_re = '[*+-]'; 1058 $marker_ol_re = '\d+[\.]'; 1059 $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)"; 1060 $marker_ol_start_re = '[0-9]+'; 1061 1062 $list = $matches[1]; 1063 $list_type = preg_match("/$marker_ul_re/", $matches[4]) ? "ul" : "ol"; 1064 1065 $marker_any_re = ( $list_type == "ul" ? $marker_ul_re : $marker_ol_re ); 1066 1067 $list .= "\n"; 1068 $result = $this->processListItems($list, $marker_any_re); 1069 1070 $ol_start = 1; 1071 if ($this->enhanced_ordered_list) { 1072 // Get the start number for ordered list. 1073 if ($list_type == 'ol') { 1074 $ol_start_array = array(); 1075 $ol_start_check = preg_match("/$marker_ol_start_re/", $matches[4], $ol_start_array); 1076 if ($ol_start_check){ 1077 $ol_start = $ol_start_array[0]; 1078 } 1079 } 1080 } 1081 1082 if ($ol_start > 1 && $list_type == 'ol'){ 1083 $result = $this->hashBlock("<$list_type start=\"$ol_start\">\n" . $result . "</$list_type>"); 1084 } else { 1085 $result = $this->hashBlock("<$list_type>\n" . $result . "</$list_type>"); 1086 } 1087 return "\n". $result ."\n\n"; 1088 } 1089 1090 /** 1091 * Nesting tracker for list levels 1092 */ 1093 protected int $list_level = 0; 1094 1095 /** 1096 * Process the contents of a single ordered or unordered list, splitting it 1097 * into individual list items. 1098 * @param string $list_str 1099 * @param string $marker_any_re 1100 * @return string 1101 */ 1102 protected function processListItems($list_str, $marker_any_re) { 1103 /** 1104 * The $this->list_level global keeps track of when we're inside a list. 1105 * Each time we enter a list, we increment it; when we leave a list, 1106 * we decrement. If it's zero, we're not in a list anymore. 1107 * 1108 * We do this because when we're not inside a list, we want to treat 1109 * something like this: 1110 * 1111 * I recommend upgrading to version 1112 * 8. Oops, now this line is treated 1113 * as a sub-list. 1114 * 1115 * As a single paragraph, despite the fact that the second line starts 1116 * with a digit-period-space sequence. 1117 * 1118 * Whereas when we're inside a list (or sub-list), that line will be 1119 * treated as the start of a sub-list. What a kludge, huh? This is 1120 * an aspect of Markdown's syntax that's hard to parse perfectly 1121 * without resorting to mind-reading. Perhaps the solution is to 1122 * change the syntax rules such that sub-lists must start with a 1123 * starting cardinal number; e.g. "1." or "a.". 1124 */ 1125 $this->list_level++; 1126 1127 // Trim trailing blank lines: 1128 $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str); 1129 1130 $list_str = preg_replace_callback('{ 1131 (\n)? # leading line = $1 1132 (^[ ]*) # leading whitespace = $2 1133 ('.$marker_any_re.' # list marker and space = $3 1134 (?:[ ]+|(?=\n)) # space only required if item is not empty 1135 ) 1136 ((?s:.*?)) # list item text = $4 1137 (?:(\n+(?=\n))|\n) # tailing blank line = $5 1138 (?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n)))) 1139 }xm', 1140 array($this, '_processListItems_callback'), $list_str); 1141 1142 $this->list_level--; 1143 return $list_str; 1144 } 1145 1146 /** 1147 * List item parsing callback 1148 * @param array $matches 1149 * @return string 1150 */ 1151 protected function _processListItems_callback($matches) { 1152 $item = $matches[4]; 1153 $leading_line =& $matches[1]; 1154 $leading_space =& $matches[2]; 1155 $marker_space = $matches[3]; 1156 $tailing_blank_line =& $matches[5]; 1157 1158 if ($leading_line || $tailing_blank_line || 1159 preg_match('/\n{2,}/', $item)) 1160 { 1161 // Replace marker with the appropriate whitespace indentation 1162 $item = $leading_space . str_repeat(' ', strlen($marker_space)) . $item; 1163 $item = $this->runBlockGamut($this->outdent($item)."\n"); 1164 } else { 1165 // Recursion for sub-lists: 1166 $item = $this->doLists($this->outdent($item)); 1167 $item = $this->formParagraphs($item, false); 1168 } 1169 1170 return "<li>" . $item . "</li>\n"; 1171 } 1172 1173 /** 1174 * Process Markdown `<pre><code>` blocks. 1175 * @param string $text 1176 * @return string 1177 */ 1178 protected function doCodeBlocks($text) { 1179 $text = preg_replace_callback('{ 1180 (?:\n\n|\A\n?) 1181 ( # $1 = the code block -- one or more lines, starting with a space/tab 1182 (?> 1183 [ ]{'.$this->tab_width.'} # Lines must start with a tab or a tab-width of spaces 1184 .*\n+ 1185 )+ 1186 ) 1187 ((?=^[ ]{0,'.$this->tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc 1188 }xm', 1189 array($this, '_doCodeBlocks_callback'), $text); 1190 1191 return $text; 1192 } 1193 1194 /** 1195 * Code block parsing callback 1196 * @param array $matches 1197 * @return string 1198 */ 1199 protected function _doCodeBlocks_callback($matches) { 1200 $codeblock = $matches[1]; 1201 1202 $codeblock = $this->outdent($codeblock); 1203 if (is_callable($this->code_block_content_func)) { 1204 $codeblock = call_user_func($this->code_block_content_func, $codeblock, ""); 1205 } else { 1206 $codeblock = htmlspecialchars($codeblock, ENT_NOQUOTES); 1207 } 1208 1209 # trim leading newlines and trailing newlines 1210 $codeblock = preg_replace('/\A\n+|\n+\z/', '', $codeblock); 1211 1212 $codeblock = "<pre><code>$codeblock\n</code></pre>"; 1213 return "\n\n" . $this->hashBlock($codeblock) . "\n\n"; 1214 } 1215 1216 /** 1217 * Create a code span markup for $code. Called from handleSpanToken. 1218 * @param string $code 1219 * @return string 1220 */ 1221 protected function makeCodeSpan($code) { 1222 if (is_callable($this->code_span_content_func)) { 1223 $code = call_user_func($this->code_span_content_func, $code); 1224 } else { 1225 $code = htmlspecialchars(trim($code), ENT_NOQUOTES); 1226 } 1227 return $this->hashPart("<code>$code</code>"); 1228 } 1229 1230 /** 1231 * Define the emphasis operators with their regex matches 1232 * @var array 1233 */ 1234 protected array $em_relist = array( 1235 '' => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?![\.,:;]?\s)', 1236 '*' => '(?<![\s*])\*(?!\*)', 1237 '_' => '(?<![\s_])_(?!_)', 1238 ); 1239 1240 /** 1241 * Define the strong operators with their regex matches 1242 * @var array 1243 */ 1244 protected array $strong_relist = array( 1245 '' => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?![\.,:;]?\s)', 1246 '**' => '(?<![\s*])\*\*(?!\*)', 1247 '__' => '(?<![\s_])__(?!_)', 1248 ); 1249 1250 /** 1251 * Define the emphasis + strong operators with their regex matches 1252 * @var array 1253 */ 1254 protected array $em_strong_relist = array( 1255 '' => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?![\.,:;]?\s)', 1256 '***' => '(?<![\s*])\*\*\*(?!\*)', 1257 '___' => '(?<![\s_])___(?!_)', 1258 ); 1259 1260 /** 1261 * Container for prepared regular expressions 1262 */ 1263 protected ?array $em_strong_prepared_relist = null; 1264 1265 /** 1266 * Prepare regular expressions for searching emphasis tokens in any 1267 * context. 1268 * @return void 1269 */ 1270 protected function prepareItalicsAndBold() { 1271 foreach ($this->em_relist as $em => $em_re) { 1272 foreach ($this->strong_relist as $strong => $strong_re) { 1273 // Construct list of allowed token expressions. 1274 $token_relist = array(); 1275 if (isset($this->em_strong_relist["$em$strong"])) { 1276 $token_relist[] = $this->em_strong_relist["$em$strong"]; 1277 } 1278 $token_relist[] = $em_re; 1279 $token_relist[] = $strong_re; 1280 1281 // Construct master expression from list. 1282 $token_re = '{(' . implode('|', $token_relist) . ')}'; 1283 $this->em_strong_prepared_relist["$em$strong"] = $token_re; 1284 } 1285 } 1286 } 1287 1288 /** 1289 * Convert Markdown italics (emphasis) and bold (strong) to HTML 1290 * @param string $text 1291 * @return string 1292 */ 1293 protected function doItalicsAndBold($text) { 1294 if ($this->in_emphasis_processing) { 1295 return $text; // avoid reentrency 1296 } 1297 $this->in_emphasis_processing = true; 1298 1299 $token_stack = array(''); 1300 $text_stack = array(''); 1301 $em = ''; 1302 $strong = ''; 1303 $tree_char_em = false; 1304 1305 while (1) { 1306 // Get prepared regular expression for seraching emphasis tokens 1307 // in current context. 1308 $token_re = $this->em_strong_prepared_relist["$em$strong"]; 1309 1310 // Each loop iteration search for the next emphasis token. 1311 // Each token is then passed to handleSpanToken. 1312 $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE); 1313 $text_stack[0] .= $parts[0]; 1314 $token =& $parts[1]; 1315 $text =& $parts[2]; 1316 1317 if (empty($token)) { 1318 // Reached end of text span: empty stack without emitting. 1319 // any more emphasis. 1320 while ($token_stack[0]) { 1321 $text_stack[1] .= array_shift($token_stack); 1322 $text_stack[0] .= array_shift($text_stack); 1323 } 1324 break; 1325 } 1326 1327 $token_len = strlen($token); 1328 if ($tree_char_em) { 1329 // Reached closing marker while inside a three-char emphasis. 1330 if ($token_len == 3) { 1331 // Three-char closing marker, close em and strong. 1332 array_shift($token_stack); 1333 $span = array_shift($text_stack); 1334 $span = $this->runSpanGamut($span); 1335 $span = "<strong><em>$span</em></strong>"; 1336 $text_stack[0] .= $this->hashPart($span); 1337 $em = ''; 1338 $strong = ''; 1339 } else { 1340 // Other closing marker: close one em or strong and 1341 // change current token state to match the other 1342 $token_stack[0] = str_repeat($token[0], 3-$token_len); 1343 $tag = $token_len == 2 ? "strong" : "em"; 1344 $span = $text_stack[0]; 1345 $span = $this->runSpanGamut($span); 1346 $span = "<$tag>$span</$tag>"; 1347 $text_stack[0] = $this->hashPart($span); 1348 $$tag = ''; // $$tag stands for $em or $strong 1349 } 1350 $tree_char_em = false; 1351 } else if ($token_len == 3) { 1352 if ($em) { 1353 // Reached closing marker for both em and strong. 1354 // Closing strong marker: 1355 for ($i = 0; $i < 2; ++$i) { 1356 $shifted_token = array_shift($token_stack); 1357 $tag = strlen($shifted_token) == 2 ? "strong" : "em"; 1358 $span = array_shift($text_stack); 1359 $span = $this->runSpanGamut($span); 1360 $span = "<$tag>$span</$tag>"; 1361 $text_stack[0] .= $this->hashPart($span); 1362 $$tag = ''; // $$tag stands for $em or $strong 1363 } 1364 } else { 1365 // Reached opening three-char emphasis marker. Push on token 1366 // stack; will be handled by the special condition above. 1367 $em = $token[0]; 1368 $strong = "$em$em"; 1369 array_unshift($token_stack, $token); 1370 array_unshift($text_stack, ''); 1371 $tree_char_em = true; 1372 } 1373 } else if ($token_len == 2) { 1374 if ($strong) { 1375 // Unwind any dangling emphasis marker: 1376 if (strlen($token_stack[0]) == 1) { 1377 $text_stack[1] .= array_shift($token_stack); 1378 $text_stack[0] .= array_shift($text_stack); 1379 $em = ''; 1380 } 1381 // Closing strong marker: 1382 array_shift($token_stack); 1383 $span = array_shift($text_stack); 1384 $span = $this->runSpanGamut($span); 1385 $span = "<strong>$span</strong>"; 1386 $text_stack[0] .= $this->hashPart($span); 1387 $strong = ''; 1388 } else { 1389 array_unshift($token_stack, $token); 1390 array_unshift($text_stack, ''); 1391 $strong = $token; 1392 } 1393 } else { 1394 // Here $token_len == 1 1395 if ($em) { 1396 if (strlen($token_stack[0]) == 1) { 1397 // Closing emphasis marker: 1398 array_shift($token_stack); 1399 $span = array_shift($text_stack); 1400 $span = $this->runSpanGamut($span); 1401 $span = "<em>$span</em>"; 1402 $text_stack[0] .= $this->hashPart($span); 1403 $em = ''; 1404 } else { 1405 $text_stack[0] .= $token; 1406 } 1407 } else { 1408 array_unshift($token_stack, $token); 1409 array_unshift($text_stack, ''); 1410 $em = $token; 1411 } 1412 } 1413 } 1414 $this->in_emphasis_processing = false; 1415 return $text_stack[0]; 1416 } 1417 1418 /** 1419 * Parse Markdown blockquotes to HTML 1420 * @param string $text 1421 * @return string 1422 */ 1423 protected function doBlockQuotes($text) { 1424 $text = preg_replace_callback('/ 1425 ( # Wrap whole match in $1 1426 (?> 1427 ^[ ]*>[ ]? # ">" at the start of a line 1428 .+\n # rest of the first line 1429 (.+\n)* # subsequent consecutive lines 1430 \n* # blanks 1431 )+ 1432 ) 1433 /xm', 1434 array($this, '_doBlockQuotes_callback'), $text); 1435 1436 return $text; 1437 } 1438 1439 /** 1440 * Blockquote parsing callback 1441 * @param array $matches 1442 * @return string 1443 */ 1444 protected function _doBlockQuotes_callback($matches) { 1445 $bq = $matches[1]; 1446 // trim one level of quoting - trim whitespace-only lines 1447 $bq = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $bq); 1448 $bq = $this->runBlockGamut($bq); // recurse 1449 1450 $bq = preg_replace('/^/m', " ", $bq); 1451 // These leading spaces cause problem with <pre> content, 1452 // so we need to fix that: 1453 $bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx', 1454 array($this, '_doBlockQuotes_callback2'), $bq); 1455 1456 return "\n" . $this->hashBlock("<blockquote>\n$bq\n</blockquote>") . "\n\n"; 1457 } 1458 1459 /** 1460 * Blockquote parsing callback 1461 * @param array $matches 1462 * @return string 1463 */ 1464 protected function _doBlockQuotes_callback2($matches) { 1465 $pre = $matches[1]; 1466 $pre = preg_replace('/^ /m', '', $pre); 1467 return $pre; 1468 } 1469 1470 /** 1471 * Parse paragraphs 1472 * 1473 * @param string $text String to process in paragraphs 1474 * @param boolean $wrap_in_p Whether paragraphs should be wrapped in <p> tags 1475 * @return string 1476 */ 1477 protected function formParagraphs($text, $wrap_in_p = true) { 1478 // Strip leading and trailing lines: 1479 $text = preg_replace('/\A\n+|\n+\z/', '', $text); 1480 1481 $grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY); 1482 1483 // Wrap <p> tags and unhashify HTML blocks 1484 foreach ($grafs as $key => $value) { 1485 if (!preg_match('/^B\x1A[0-9]+B$/', $value)) { 1486 // Is a paragraph. 1487 $value = $this->runSpanGamut($value); 1488 if ($wrap_in_p) { 1489 $value = preg_replace('/^([ ]*)/', "<p>", $value); 1490 $value .= "</p>"; 1491 } 1492 $grafs[$key] = $this->unhash($value); 1493 } else { 1494 // Is a block. 1495 // Modify elements of @grafs in-place... 1496 $graf = $value; 1497 $block = $this->html_hashes[$graf]; 1498 $graf = $block; 1499 // if (preg_match('{ 1500 // \A 1501 // ( # $1 = <div> tag 1502 // <div \s+ 1503 // [^>]* 1504 // \b 1505 // markdown\s*=\s* ([\'"]) # $2 = attr quote char 1506 // 1 1507 // \2 1508 // [^>]* 1509 // > 1510 // ) 1511 // ( # $3 = contents 1512 // .* 1513 // ) 1514 // (</div>) # $4 = closing tag 1515 // \z 1516 // }xs', $block, $matches)) 1517 // { 1518 // list(, $div_open, , $div_content, $div_close) = $matches; 1519 // 1520 // // We can't call Markdown(), because that resets the hash; 1521 // // that initialization code should be pulled into its own sub, though. 1522 // $div_content = $this->hashHTMLBlocks($div_content); 1523 // 1524 // // Run document gamut methods on the content. 1525 // foreach ($this->document_gamut as $method => $priority) { 1526 // $div_content = $this->$method($div_content); 1527 // } 1528 // 1529 // $div_open = preg_replace( 1530 // '{\smarkdown\s*=\s*([\'"]).+?\1}', '', $div_open); 1531 // 1532 // $graf = $div_open . "\n" . $div_content . "\n" . $div_close; 1533 // } 1534 $grafs[$key] = $graf; 1535 } 1536 } 1537 1538 return implode("\n\n", $grafs); 1539 } 1540 1541 /** 1542 * Encode text for a double-quoted HTML attribute. This function 1543 * is *not* suitable for attributes enclosed in single quotes. 1544 * @param string $text 1545 * @return string 1546 */ 1547 protected function encodeAttribute($text) { 1548 $text = $this->encodeAmpsAndAngles($text); 1549 $text = str_replace('"', '"', $text); 1550 return $text; 1551 } 1552 1553 /** 1554 * Encode text for a double-quoted HTML attribute containing a URL, 1555 * applying the URL filter if set. Also generates the textual 1556 * representation for the URL (removing mailto: or tel:) storing it in $text. 1557 * This function is *not* suitable for attributes enclosed in single quotes. 1558 * 1559 * @param string $url 1560 * @param string $text Passed by reference 1561 * @return string URL 1562 */ 1563 protected function encodeURLAttribute($url, &$text = null) { 1564 if (is_callable($this->url_filter_func)) { 1565 $url = call_user_func($this->url_filter_func, $url); 1566 } 1567 1568 if (preg_match('{^mailto:}i', $url)) { 1569 $url = $this->encodeEntityObfuscatedAttribute($url, $text, 7); 1570 } else if (preg_match('{^tel:}i', $url)) { 1571 $url = $this->encodeAttribute($url); 1572 $text = substr($url, 4); 1573 } else { 1574 $url = $this->encodeAttribute($url); 1575 $text = $url; 1576 } 1577 1578 return $url; 1579 } 1580 1581 /** 1582 * Smart processing for ampersands and angle brackets that need to 1583 * be encoded. Valid character entities are left alone unless the 1584 * no-entities mode is set. 1585 * @param string $text 1586 * @return string 1587 */ 1588 protected function encodeAmpsAndAngles($text) { 1589 if ($this->no_entities) { 1590 $text = str_replace('&', '&', $text); 1591 } else { 1592 // Ampersand-encoding based entirely on Nat Irons's Amputator 1593 // MT plugin: <http://bumppo.net/projects/amputator/> 1594 $text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/', 1595 '&', $text); 1596 } 1597 // Encode remaining <'s 1598 $text = str_replace('<', '<', $text); 1599 1600 return $text; 1601 } 1602 1603 /** 1604 * Parse Markdown automatic links to anchor HTML tags 1605 * @param string $text 1606 * @return string 1607 */ 1608 protected function doAutoLinks($text) { 1609 $text = preg_replace_callback('{<((https?|ftp|dict|tel):[^\'">\s]+)>}i', 1610 array($this, '_doAutoLinks_url_callback'), $text); 1611 1612 // Email addresses: <address@domain.foo> 1613 $text = preg_replace_callback('{ 1614 < 1615 (?:mailto:)? 1616 ( 1617 (?: 1618 [-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+ 1619 | 1620 ".*?" 1621 ) 1622 \@ 1623 (?: 1624 [-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+ 1625 | 1626 \[[\d.a-fA-F:]+\] # IPv4 & IPv6 1627 ) 1628 ) 1629 > 1630 }xi', 1631 array($this, '_doAutoLinks_email_callback'), $text); 1632 1633 return $text; 1634 } 1635 1636 /** 1637 * Parse URL callback 1638 * @param array $matches 1639 * @return string 1640 */ 1641 protected function _doAutoLinks_url_callback($matches) { 1642 $url = $this->encodeURLAttribute($matches[1], $text); 1643 $link = "<a href=\"$url\">$text</a>"; 1644 return $this->hashPart($link); 1645 } 1646 1647 /** 1648 * Parse email address callback 1649 * @param array $matches 1650 * @return string 1651 */ 1652 protected function _doAutoLinks_email_callback($matches) { 1653 $addr = $matches[1]; 1654 $url = $this->encodeURLAttribute("mailto:$addr", $text); 1655 $link = "<a href=\"$url\">$text</a>"; 1656 return $this->hashPart($link); 1657 } 1658 1659 /** 1660 * Input: some text to obfuscate, e.g. "mailto:foo@example.com" 1661 * 1662 * Output: the same text but with most characters encoded as either a 1663 * decimal or hex entity, in the hopes of foiling most address 1664 * harvesting spam bots. E.g.: 1665 * 1666 * mailto:foo 1667 * @example.co 1668 * m 1669 * 1670 * Note: the additional output $tail is assigned the same value as the 1671 * ouput, minus the number of characters specified by $head_length. 1672 * 1673 * Based by a filter by Matthew Wickline, posted to BBEdit-Talk. 1674 * With some optimizations by Milian Wolff. Forced encoding of HTML 1675 * attribute special characters by Allan Odgaard. 1676 * 1677 * @param string $text 1678 * @param string $tail Passed by reference 1679 * @param integer $head_length 1680 * @return string 1681 */ 1682 protected function encodeEntityObfuscatedAttribute($text, &$tail = null, $head_length = 0) { 1683 if ($text == "") { 1684 return $tail = ""; 1685 } 1686 1687 $chars = preg_split('/(?<!^)(?!$)/', $text); 1688 $seed = (int)abs(crc32($text) / strlen($text)); // Deterministic seed. 1689 1690 foreach ($chars as $key => $char) { 1691 $ord = ord($char); 1692 // Ignore non-ascii chars. 1693 if ($ord < 128) { 1694 $r = ($seed * (1 + $key)) % 100; // Pseudo-random function. 1695 // roughly 10% raw, 45% hex, 45% dec 1696 // '@' *must* be encoded. I insist. 1697 // '"' and '>' have to be encoded inside the attribute 1698 if ($r > 90 && strpos('@"&>', $char) === false) { 1699 /* do nothing */ 1700 } else if ($r < 45) { 1701 $chars[$key] = '&#x'.dechex($ord).';'; 1702 } else { 1703 $chars[$key] = '&#'.$ord.';'; 1704 } 1705 } 1706 } 1707 1708 $text = implode('', $chars); 1709 $tail = $head_length ? implode('', array_slice($chars, $head_length)) : $text; 1710 1711 return $text; 1712 } 1713 1714 /** 1715 * Take the string $str and parse it into tokens, hashing embeded HTML, 1716 * escaped characters and handling code spans. 1717 * @param string $str 1718 * @return string 1719 */ 1720 protected function parseSpan($str) { 1721 $output = ''; 1722 1723 $span_re = '{ 1724 ( 1725 \\\\'.$this->escape_chars_re.' 1726 | 1727 (?<![`\\\\]) 1728 `+ # code span marker 1729 '.( $this->no_markup ? '' : ' 1730 | 1731 <!-- .*? --> # comment 1732 | 1733 <\?.*?\?> | <%.*?%> # processing instruction 1734 | 1735 <[!$]?[-a-zA-Z0-9:_]+ # regular tags 1736 (?> 1737 \s 1738 (?>[^"\'>]+|"[^"]*"|\'[^\']*\')* 1739 )? 1740 > 1741 | 1742 <[-a-zA-Z0-9:_]+\s*/> # xml-style empty tag 1743 | 1744 </[-a-zA-Z0-9:_]+\s*> # closing tag 1745 ').' 1746 ) 1747 }xs'; 1748 1749 while (1) { 1750 // Each loop iteration seach for either the next tag, the next 1751 // openning code span marker, or the next escaped character. 1752 // Each token is then passed to handleSpanToken. 1753 $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE); 1754 1755 // Create token from text preceding tag. 1756 if ($parts[0] != "") { 1757 $output .= $parts[0]; 1758 } 1759 1760 // Check if we reach the end. 1761 if (isset($parts[1])) { 1762 $output .= $this->handleSpanToken($parts[1], $parts[2]); 1763 $str = $parts[2]; 1764 } else { 1765 break; 1766 } 1767 } 1768 1769 return $output; 1770 } 1771 1772 /** 1773 * Handle $token provided by parseSpan by determining its nature and 1774 * returning the corresponding value that should replace it. 1775 * @param string $token 1776 * @param string $str Passed by reference 1777 * @return string 1778 */ 1779 protected function handleSpanToken($token, &$str) { 1780 switch ($token[0]) { 1781 case "\\": 1782 return $this->hashPart("&#". ord($token[1]). ";"); 1783 case "`": 1784 // Search for end marker in remaining text. 1785 if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm', 1786 $str, $matches)) 1787 { 1788 $str = $matches[2]; 1789 $codespan = $this->makeCodeSpan($matches[1]); 1790 return $this->hashPart($codespan); 1791 } 1792 return $token; // Return as text since no ending marker found. 1793 default: 1794 return $this->hashPart($token); 1795 } 1796 } 1797 1798 /** 1799 * Remove one level of line-leading tabs or spaces 1800 * @param string $text 1801 * @return string 1802 */ 1803 protected function outdent($text) { 1804 return preg_replace('/^(\t|[ ]{1,' . $this->tab_width . '})/m', '', $text); 1805 } 1806 1807 1808 /** 1809 * String length function for detab. `_initDetab` will create a function to 1810 * handle UTF-8 if the default function does not exist. 1811 * can be a string or function 1812 */ 1813 protected $utf8_strlen = 'mb_strlen'; 1814 1815 /** 1816 * Replace tabs with the appropriate amount of spaces. 1817 * 1818 * For each line we separate the line in blocks delemited by tab characters. 1819 * Then we reconstruct every line by adding the appropriate number of space 1820 * between each blocks. 1821 * 1822 * @param string $text 1823 * @return string 1824 */ 1825 protected function detab($text) { 1826 $text = preg_replace_callback('/^.*\t.*$/m', 1827 array($this, '_detab_callback'), $text); 1828 1829 return $text; 1830 } 1831 1832 /** 1833 * Replace tabs callback 1834 * @param string $matches 1835 * @return string 1836 */ 1837 protected function _detab_callback($matches) { 1838 $line = $matches[0]; 1839 $strlen = $this->utf8_strlen; // strlen function for UTF-8. 1840 1841 // Split in blocks. 1842 $blocks = explode("\t", $line); 1843 // Add each blocks to the line. 1844 $line = $blocks[0]; 1845 unset($blocks[0]); // Do not add first block twice. 1846 foreach ($blocks as $block) { 1847 // Calculate amount of space, insert spaces, insert block. 1848 $amount = $this->tab_width - 1849 $strlen($line, 'UTF-8') % $this->tab_width; 1850 $line .= str_repeat(" ", $amount) . $block; 1851 } 1852 return $line; 1853 } 1854 1855 /** 1856 * Check for the availability of the function in the `utf8_strlen` property 1857 * (initially `mb_strlen`). If the function is not available, create a 1858 * function that will loosely count the number of UTF-8 characters with a 1859 * regular expression. 1860 * @return void 1861 */ 1862 protected function _initDetab() { 1863 1864 if (function_exists($this->utf8_strlen)) { 1865 return; 1866 } 1867 1868 $this->utf8_strlen = fn($text) => preg_match_all('/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/', $text, $m); 1869 } 1870 1871 /** 1872 * Swap back in all the tags hashed by _HashHTMLBlocks. 1873 * @param string $text 1874 * @return string 1875 */ 1876 protected function unhash($text) { 1877 return preg_replace_callback('/(.)\x1A[0-9]+\1/', 1878 array($this, '_unhash_callback'), $text); 1879 } 1880 1881 /** 1882 * Unhashing callback 1883 * @param array $matches 1884 * @return string 1885 */ 1886 protected function _unhash_callback($matches) { 1887 return $this->html_hashes[$matches[0]]; 1888 } 1889 }
title
Description
Body
title
Description
Body
title
Description
Body
title
Body