Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.3.x will end 7 October 2024 (12 months).
  • Bug fixes for security issues in 4.3.x will end 21 April 2025 (18 months).
  • PHP version: minimum PHP 8.0.0. Note: the minimum PHP version has increased since Moodle 4.1. PHP 8.2.x is also supported.
   1  <?php
   2  
   3  /**
   4   * Injector that auto paragraphs text in the root node based on
   5   * double-spacing.
   6   * @todo Ensure all states are unit tested, including variations as well.
   7   * @todo Make a graph of the flow control for this Injector.
   8   */
   9  class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
  10  {
  11      /**
  12       * @type string
  13       */
  14      public $name = 'AutoParagraph';
  15  
  16      /**
  17       * @type array
  18       */
  19      public $needed = array('p');
  20  
  21      /**
  22       * @return HTMLPurifier_Token_Start
  23       */
  24      private function _pStart()
  25      {
  26          $par = new HTMLPurifier_Token_Start('p');
  27          $par->armor['MakeWellFormed_TagClosedError'] = true;
  28          return $par;
  29      }
  30  
  31      /**
  32       * @param HTMLPurifier_Token_Text $token
  33       */
  34      public function handleText(&$token)
  35      {
  36          $text = $token->data;
  37          // Does the current parent allow <p> tags?
  38          if ($this->allowsElement('p')) {
  39              if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
  40                  // Note that we have differing behavior when dealing with text
  41                  // in the anonymous root node, or a node inside the document.
  42                  // If the text as a double-newline, the treatment is the same;
  43                  // if it doesn't, see the next if-block if you're in the document.
  44  
  45                  $i = $nesting = null;
  46                  if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
  47                      // State 1.1: ...    ^ (whitespace, then document end)
  48                      //               ----
  49                      // This is a degenerate case
  50                  } else {
  51                      if (!$token->is_whitespace || $this->_isInline($current)) {
  52                          // State 1.2: PAR1
  53                          //            ----
  54  
  55                          // State 1.3: PAR1\n\nPAR2
  56                          //            ------------
  57  
  58                          // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
  59                          //                 ------------
  60                          $token = array($this->_pStart());
  61                          $this->_splitText($text, $token);
  62                      } else {
  63                          // State 1.5: \n<hr />
  64                          //            --
  65                      }
  66                  }
  67              } else {
  68                  // State 2:   <div>PAR1... (similar to 1.4)
  69                  //                 ----
  70  
  71                  // We're in an element that allows paragraph tags, but we're not
  72                  // sure if we're going to need them.
  73                  if ($this->_pLookAhead()) {
  74                      // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
  75                      //                 ----
  76                      // Note: This will always be the first child, since any
  77                      // previous inline element would have triggered this very
  78                      // same routine, and found the double newline. One possible
  79                      // exception would be a comment.
  80                      $token = array($this->_pStart(), $token);
  81                  } else {
  82                      // State 2.2.1: <div>PAR1<div>
  83                      //                   ----
  84  
  85                      // State 2.2.2: <div>PAR1<b>PAR1</b></div>
  86                      //                   ----
  87                  }
  88              }
  89              // Is the current parent a <p> tag?
  90          } elseif (!empty($this->currentNesting) &&
  91              $this->currentNesting[count($this->currentNesting) - 1]->name == 'p') {
  92              // State 3.1: ...<p>PAR1
  93              //                  ----
  94  
  95              // State 3.2: ...<p>PAR1\n\nPAR2
  96              //                  ------------
  97              $token = array();
  98              $this->_splitText($text, $token);
  99              // Abort!
 100          } else {
 101              // State 4.1: ...<b>PAR1
 102              //                  ----
 103  
 104              // State 4.2: ...<b>PAR1\n\nPAR2
 105              //                  ------------
 106          }
 107      }
 108  
 109      /**
 110       * @param HTMLPurifier_Token $token
 111       */
 112      public function handleElement(&$token)
 113      {
 114          // We don't have to check if we're already in a <p> tag for block
 115          // tokens, because the tag would have been autoclosed by MakeWellFormed.
 116          if ($this->allowsElement('p')) {
 117              if (!empty($this->currentNesting)) {
 118                  if ($this->_isInline($token)) {
 119                      // State 1: <div>...<b>
 120                      //                  ---
 121                      // Check if this token is adjacent to the parent token
 122                      // (seek backwards until token isn't whitespace)
 123                      $i = null;
 124                      $this->backward($i, $prev);
 125  
 126                      if (!$prev instanceof HTMLPurifier_Token_Start) {
 127                          // Token wasn't adjacent
 128                          if ($prev instanceof HTMLPurifier_Token_Text &&
 129                              substr($prev->data, -2) === "\n\n"
 130                          ) {
 131                              // State 1.1.4: <div><p>PAR1</p>\n\n<b>
 132                              //                                  ---
 133                              // Quite frankly, this should be handled by splitText
 134                              $token = array($this->_pStart(), $token);
 135                          } else {
 136                              // State 1.1.1: <div><p>PAR1</p><b>
 137                              //                              ---
 138                              // State 1.1.2: <div><br /><b>
 139                              //                         ---
 140                              // State 1.1.3: <div>PAR<b>
 141                              //                      ---
 142                          }
 143                      } else {
 144                          // State 1.2.1: <div><b>
 145                          //                   ---
 146                          // Lookahead to see if <p> is needed.
 147                          if ($this->_pLookAhead()) {
 148                              // State 1.3.1: <div><b>PAR1\n\nPAR2
 149                              //                   ---
 150                              $token = array($this->_pStart(), $token);
 151                          } else {
 152                              // State 1.3.2: <div><b>PAR1</b></div>
 153                              //                   ---
 154  
 155                              // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
 156                              //                   ---
 157                          }
 158                      }
 159                  } else {
 160                      // State 2.3: ...<div>
 161                      //               -----
 162                  }
 163              } else {
 164                  if ($this->_isInline($token)) {
 165                      // State 3.1: <b>
 166                      //            ---
 167                      // This is where the {p} tag is inserted, not reflected in
 168                      // inputTokens yet, however.
 169                      $token = array($this->_pStart(), $token);
 170                  } else {
 171                      // State 3.2: <div>
 172                      //            -----
 173                  }
 174  
 175                  $i = null;
 176                  if ($this->backward($i, $prev)) {
 177                      if (!$prev instanceof HTMLPurifier_Token_Text) {
 178                          // State 3.1.1: ...</p>{p}<b>
 179                          //                        ---
 180                          // State 3.2.1: ...</p><div>
 181                          //                     -----
 182                          if (!is_array($token)) {
 183                              $token = array($token);
 184                          }
 185                          array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
 186                      } else {
 187                          // State 3.1.2: ...</p>\n\n{p}<b>
 188                          //                            ---
 189                          // State 3.2.2: ...</p>\n\n<div>
 190                          //                         -----
 191                          // Note: PAR<ELEM> cannot occur because PAR would have been
 192                          // wrapped in <p> tags.
 193                      }
 194                  }
 195              }
 196          } else {
 197              // State 2.2: <ul><li>
 198              //                ----
 199              // State 2.4: <p><b>
 200              //               ---
 201          }
 202      }
 203  
 204      /**
 205       * Splits up a text in paragraph tokens and appends them
 206       * to the result stream that will replace the original
 207       * @param string $data String text data that will be processed
 208       *    into paragraphs
 209       * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
 210       *    tags will be appended onto
 211       */
 212      private function _splitText($data, &$result)
 213      {
 214          $raw_paragraphs = explode("\n\n", $data);
 215          $paragraphs = array(); // without empty paragraphs
 216          $needs_start = false;
 217          $needs_end = false;
 218  
 219          $c = count($raw_paragraphs);
 220          if ($c == 1) {
 221              // There were no double-newlines, abort quickly. In theory this
 222              // should never happen.
 223              $result[] = new HTMLPurifier_Token_Text($data);
 224              return;
 225          }
 226          for ($i = 0; $i < $c; $i++) {
 227              $par = $raw_paragraphs[$i];
 228              if (trim($par) !== '') {
 229                  $paragraphs[] = $par;
 230              } else {
 231                  if ($i == 0) {
 232                      // Double newline at the front
 233                      if (empty($result)) {
 234                          // The empty result indicates that the AutoParagraph
 235                          // injector did not add any start paragraph tokens.
 236                          // This means that we have been in a paragraph for
 237                          // a while, and the newline means we should start a new one.
 238                          $result[] = new HTMLPurifier_Token_End('p');
 239                          $result[] = new HTMLPurifier_Token_Text("\n\n");
 240                          // However, the start token should only be added if
 241                          // there is more processing to be done (i.e. there are
 242                          // real paragraphs in here). If there are none, the
 243                          // next start paragraph tag will be handled by the
 244                          // next call to the injector
 245                          $needs_start = true;
 246                      } else {
 247                          // We just started a new paragraph!
 248                          // Reinstate a double-newline for presentation's sake, since
 249                          // it was in the source code.
 250                          array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
 251                      }
 252                  } elseif ($i + 1 == $c) {
 253                      // Double newline at the end
 254                      // There should be a trailing </p> when we're finally done.
 255                      $needs_end = true;
 256                  }
 257              }
 258          }
 259  
 260          // Check if this was just a giant blob of whitespace. Move this earlier,
 261          // perhaps?
 262          if (empty($paragraphs)) {
 263              return;
 264          }
 265  
 266          // Add the start tag indicated by \n\n at the beginning of $data
 267          if ($needs_start) {
 268              $result[] = $this->_pStart();
 269          }
 270  
 271          // Append the paragraphs onto the result
 272          foreach ($paragraphs as $par) {
 273              $result[] = new HTMLPurifier_Token_Text($par);
 274              $result[] = new HTMLPurifier_Token_End('p');
 275              $result[] = new HTMLPurifier_Token_Text("\n\n");
 276              $result[] = $this->_pStart();
 277          }
 278  
 279          // Remove trailing start token; Injector will handle this later if
 280          // it was indeed needed. This prevents from needing to do a lookahead,
 281          // at the cost of a lookbehind later.
 282          array_pop($result);
 283  
 284          // If there is no need for an end tag, remove all of it and let
 285          // MakeWellFormed close it later.
 286          if (!$needs_end) {
 287              array_pop($result); // removes \n\n
 288              array_pop($result); // removes </p>
 289          }
 290      }
 291  
 292      /**
 293       * Returns true if passed token is inline (and, ergo, allowed in
 294       * paragraph tags)
 295       * @param HTMLPurifier_Token $token
 296       * @return bool
 297       */
 298      private function _isInline($token)
 299      {
 300          return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
 301      }
 302  
 303      /**
 304       * Looks ahead in the token list and determines whether or not we need
 305       * to insert a <p> tag.
 306       * @return bool
 307       */
 308      private function _pLookAhead()
 309      {
 310          if ($this->currentToken instanceof HTMLPurifier_Token_Start) {
 311              $nesting = 1;
 312          } else {
 313              $nesting = 0;
 314          }
 315          $ok = false;
 316          $i = null;
 317          while ($this->forwardUntilEndToken($i, $current, $nesting)) {
 318              $result = $this->_checkNeedsP($current);
 319              if ($result !== null) {
 320                  $ok = $result;
 321                  break;
 322              }
 323          }
 324          return $ok;
 325      }
 326  
 327      /**
 328       * Determines if a particular token requires an earlier inline token
 329       * to get a paragraph. This should be used with _forwardUntilEndToken
 330       * @param HTMLPurifier_Token $current
 331       * @return bool
 332       */
 333      private function _checkNeedsP($current)
 334      {
 335          if ($current instanceof HTMLPurifier_Token_Start) {
 336              if (!$this->_isInline($current)) {
 337                  // <div>PAR1<div>
 338                  //      ----
 339                  // Terminate early, since we hit a block element
 340                  return false;
 341              }
 342          } elseif ($current instanceof HTMLPurifier_Token_Text) {
 343              if (strpos($current->data, "\n\n") !== false) {
 344                  // <div>PAR1<b>PAR1\n\nPAR2
 345                  //      ----
 346                  return true;
 347              } else {
 348                  // <div>PAR1<b>PAR1...
 349                  //      ----
 350              }
 351          }
 352          return null;
 353      }
 354  }
 355  
 356  // vim: et sw=4 sts=4