Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.3.x will end 7 October 2024 (12 months).
  • Bug fixes for security issues in 4.3.x will end 21 April 2025 (18 months).
  • PHP version: minimum PHP 8.0.0. Note: the minimum PHP version has increased since Moodle 4.1. PHP 8.2.x is also supported.
   1  <?php
   2  
   /**
    * Error handler that deliberately does nothing.
    *
    * It is installed through set_error_handler() while the CSS parser runs
    * so that any PHP errors raised during parsing are swallowed.
    *
    * It lives at the top level rather than as a static method because
    * PHP 5.2.0 does not seem to understand how to interpret this filter if
    * the handler is a static method. It's all rather silly, but if we keep
    * going this route it might be reasonable to coalesce all such handlers
    * into one.
    *
    * @return null Always null; the error is simply discarded.
    */
   function htmlpurifier_filter_extractstyleblocks_muteerrorhandler()
   {
       return null;
   }
  10  
  /**
   * This filter extracts <style> blocks from input HTML, cleans them up
   * using CSSTidy, and then places them in $purifier->context->get('StyleBlocks')
   * so they can be used elsewhere in the document.
   *
   * @note
   *      See tests/HTMLPurifier/Filter/ExtractStyleBlocksTest.php for
   *      sample usage.
   *
   * @note
   *      This filter can also be used on stylesheets not included in the
   *      document--something purists would probably prefer. Just directly
   *      call HTMLPurifier_Filter_ExtractStyleBlocks->cleanCSS()
   */
  class HTMLPurifier_Filter_ExtractStyleBlocks extends HTMLPurifier_Filter
  {
      /**
       * @type string
       */
      public $name = 'ExtractStyleBlocks';

      /**
       * Raw contents of the <style> blocks collected by styleCallback()
       * during the current preFilter() run.
       * @type array
       */
      private $_styleMatches = array();

      /**
       * CSS parser; replaced by Filter.ExtractStyleBlocks.TidyImpl when
       * that directive is set.
       * @type csstidy
       */
      private $_tidy;

      /**
       * Validator applied to the part of a selector following '#'.
       * @type HTMLPurifier_AttrDef_HTML_ID
       */
      private $_id_attrdef;

      /**
       * Validator applied to the part of a selector following '.'.
       * @type HTMLPurifier_AttrDef_CSS_Ident
       */
      private $_class_attrdef;

      /**
       * Validator applied to the part of a selector following ':'
       * (the whitelist of accepted pseudo-classes).
       * @type HTMLPurifier_AttrDef_Enum
       */
      private $_enum_attrdef;

      public function __construct()
      {
          $this->_tidy = new csstidy();
          $this->_tidy->set_cfg('lowercase_s', false);
          $this->_id_attrdef = new HTMLPurifier_AttrDef_HTML_ID(true);
          $this->_class_attrdef = new HTMLPurifier_AttrDef_CSS_Ident();
          $this->_enum_attrdef = new HTMLPurifier_AttrDef_Enum(
              array(
                  'first-child',
                  'link',
                  'visited',
                  'active',
                  'hover',
                  'focus'
              )
          );
      }

      /**
       * Save the contents of CSS blocks to style matches
       * @param array $matches preg_replace style $matches array
       * @return string Always the empty string, so that the matched
       *                <style> element is removed from the HTML.
       */
      protected function styleCallback($matches)
      {
          $this->_styleMatches[] = $matches[1];
          return '';
      }

      /**
       * Removes inline <style> tags from HTML, saves them for later use
       * @param string $html
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return string
       * @todo Extend to indicate non-text/css style blocks
       */
      public function preFilter($html, $config, $context)
      {
          $tidy = $config->get('Filter.ExtractStyleBlocks.TidyImpl');
          if ($tidy !== null) {
              $this->_tidy = $tidy;
          }
          // NB: this must be NON-greedy because if we have
          // <style>foo</style>  <style>bar</style>
          // we must not grab foo</style>  <style>bar
          $html = preg_replace_callback('#<style(?:\s.*)?>(.*)<\/style>#isU', array($this, 'styleCallback'), $html);
          $style_blocks = $this->_styleMatches;
          $this->_styleMatches = array(); // reset
          // NOTE(review): the blocks are registered *before* they are cleaned
          // below; this presumably relies on register() binding $style_blocks
          // by reference -- confirm against HTMLPurifier_Context.
          $context->register('StyleBlocks', $style_blocks); // $context must not be reused
          if ($this->_tidy) {
              foreach ($style_blocks as &$style) {
                  $style = $this->cleanCSS($style, $config, $context);
              }
              // break the reference so later writes to $style cannot
              // silently modify the last element of $style_blocks
              unset($style);
          }
          return $html;
      }

      /**
       * Takes CSS (the stuff found in <style>) and cleans it.
       *
       * Selectors are rebuilt from validated pieces only (optionally
       * prefixed with every configured scope), declarations are validated
       * against the CSS definition, and @import/@charset/@namespace data
       * is discarded before the stylesheet is printed again.
       *
       * @warning Requires CSSTidy <http://csstidy.sourceforge.net/>
       * @param string $css CSS styling to clean
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @throws HTMLPurifier_Exception
       * @return string Cleaned CSS
       */
      public function cleanCSS($css, $config, $context)
      {
          // prepare scope
          $scope = $config->get('Filter.ExtractStyleBlocks.Scope');
          if ($scope !== null) {
              $scopes = array_map('trim', explode(',', $scope));
          } else {
              $scopes = array();
          }

          $css = $this->stripCommentWrapper($css);

          // Mute PHP errors raised by the parser. The previous handler is
          // restored in a finally block so that an exception thrown by
          // parse() cannot leave errors muted for the rest of the request.
          set_error_handler('htmlpurifier_filter_extractstyleblocks_muteerrorhandler');
          try {
              $this->_tidy->parse($css);
          } finally {
              restore_error_handler();
          }

          $css_definition = $config->getDefinition('CSS');
          $html_definition = $config->getDefinition('HTML');
          $new_css = array();
          foreach ($this->_tidy->css as $k => $decls) {
              // $decls are all CSS declarations inside an @ selector
              $new_decls = array();
              foreach ($decls as $selector => $style) {
                  $selector = trim($selector);
                  if ($selector === '') {
                      continue; // should not happen
                  }
                  // handle ruleset: a comma separated list of selectors
                  $new_selectors = array();
                  foreach (array_map('trim', explode(',', $selector)) as $sel) {
                      $nsel = $this->cleanSelector($sel, $html_definition, $config, $context);
                      if ($nsel === null) {
                          continue;
                      }
                      if (!empty($scopes)) {
                          foreach ($scopes as $s) {
                              $new_selectors[] = "$s $nsel";
                          }
                      } else {
                          $new_selectors[] = $nsel;
                      }
                  }
                  if (empty($new_selectors)) {
                      // nothing valid left to attach the declarations to
                      continue;
                  }
                  $selector = implode(', ', $new_selectors);
                  $new_decls[$selector] = $this->cleanDeclarations($style, $css_definition, $config, $context);
              }
              $new_css[$k] = $new_decls;
          }
          // remove stuff that shouldn't be used, could be reenabled
          // after security risks are analyzed
          $this->_tidy->css = $new_css;
          $this->_tidy->import = array();
          $this->_tidy->charset = null;
          $this->_tidy->namespace = null;
          $css = $this->_tidy->print->plain();
          // we are going to escape any special characters <>& to ensure
          // that no funny business occurs (i.e. </style> in a font-family prop).
          if ($config->get('Filter.ExtractStyleBlocks.Escaping')) {
              $css = str_replace(
                  array('<', '>', '&'),
                  array('\3C ', '\3E ', '\26 '),
                  $css
              );
          }
          return $css;
      }

      /**
       * Trims the CSS and removes a leading "<!--" and a trailing "-->",
       * i.e. the HTML comment markers sometimes wrapped around the
       * contents of a <style> element.
       * @param string $css
       * @return string
       */
      private function stripCommentWrapper($css)
      {
          $css = trim($css);
          if (strncmp('<!--', $css, 4) === 0) {
              $css = substr($css, 4);
          }
          if (strlen($css) > 3 && substr($css, -3) === '-->') {
              $css = substr($css, 0, -3);
          }
          return trim($css);
      }

      // Selector parsing.
      //
      // Here is the relevant part of the CSS grammar:
      //
      // ruleset
      //   : selector [ ',' S* selector ]* '{' ...
      // selector
      //   : simple_selector [ combinator selector | S+ [ combinator? selector ]? ]?
      // combinator
      //   : '+' S*
      //   | '>' S*
      // simple_selector
      //   : element_name [ HASH | class | attrib | pseudo ]*
      //   | [ HASH | class | attrib | pseudo ]+
      // element_name
      //   : IDENT | '*'
      //   ;
      // class
      //   : '.' IDENT
      //   ;
      // attrib
      //   : '[' S* IDENT S* [ [ '=' | INCLUDES | DASHMATCH ] S*
      //     [ IDENT | STRING ] S* ]? ']'
      //   ;
      // pseudo
      //   : ':' [ IDENT | FUNCTION S* [IDENT S*]? ')' ]
      //   ;
      //
      // For reference, here are the relevant tokens:
      //
      // HASH         #{name}
      // IDENT        {ident}
      // INCLUDES     ~=
      // DASHMATCH    |=
      // STRING       {string}
      // FUNCTION     {ident}\(
      //
      // And the lexical scanner tokens
      //
      // name         {nmchar}+
      // nmchar       [_a-z0-9-]|{nonascii}|{escape}
      // nonascii     [\240-\377]
      // escape       {unicode}|\\[^\r\n\f0-9a-f]
      // unicode      \\{h}{1,6}(\r\n|[ \t\r\n\f])?
      // ident        -?{nmstart}{nmchar}*
      // nmstart      [_a-z]|{nonascii}|{escape}
      // string       {string1}|{string2}
      // string1      \"([^\n\r\f\\"]|\\{nl}|{escape})*\"
      // string2      \'([^\n\r\f\\']|\\{nl}|{escape})*\'
      //
      // We'll implement a subset (in order to reduce attack
      // surface); in particular:
      //
      //      - No Unicode support
      //      - No escapes support
      //      - No string support (by proxy no attrib support)
      //      - element_name is matched against allowed
      //        elements (some people might find this
      //        annoying...)
      //      - Pseudo-classes limited to :first-child, :link,
      //        :visited, :active, :hover, :focus

      /**
       * Rebuilds one selector (a single entry of a comma separated
       * ruleset) out of the simple selectors that validate.
       * @param string $sel Selector, already trimmed
       * @param HTMLPurifier_HTMLDefinition $html_definition
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @throws HTMLPurifier_Exception
       * @return string|null Cleaned selector, or null if no part of it
       *                     was valid
       */
      private function cleanSelector($sel, $html_definition, $config, $context)
      {
          // split on +, > and spaces; even indices are chunks, odd
          // indices are delimiters
          $basic_selectors = preg_split('/\s*([+> ])\s*/', $sel, -1, PREG_SPLIT_DELIM_CAPTURE);
          $nsel = null;
          // $delim is only read once $nsel is non-null, which cannot
          // happen before a delimiter has been seen
          $delim = null;
          for ($i = 0, $c = count($basic_selectors); $i < $c; $i++) {
              $x = $basic_selectors[$i];
              if ($i % 2) {
                  // delimiter: normalise the surrounding whitespace
                  if ($x === ' ') {
                      $delim = ' ';
                  } else {
                      $delim = ' ' . $x . ' ';
                  }
                  continue;
              }
              // simple selector
              $nx = $this->cleanSimpleSelector($x, $html_definition, $config, $context);
              if ($nx === null) {
                  // delimiters to the left of invalid
                  // basic selector ignored
                  continue;
              }
              if ($nsel === null) {
                  $nsel = $nx;
              } else {
                  $nsel .= $delim . $nx;
              }
          }
          return $nsel;
      }

      /**
       * Rebuilds one simple selector (element name followed by any number
       * of #id, .class and :pseudo parts), dropping every part that does
       * not validate.
       * @param string $simple
       * @param HTMLPurifier_HTMLDefinition $html_definition
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @throws HTMLPurifier_Exception
       * @return string|null Cleaned simple selector, or null if it had no
       *                     valid content at all
       */
      private function cleanSimpleSelector($simple, $html_definition, $config, $context)
      {
          // index 0 is the element name (possibly empty); after that odd
          // indices are delimiters and even indices the text following them
          $components = preg_split('/([#.:])/', $simple, -1, PREG_SPLIT_DELIM_CAPTURE);
          $sdelim = null;
          // $nx stays null if we don't manage to find any valid selector
          // content, in which case the caller ignores the outer delimiter
          $nx = null;
          for ($j = 0, $cc = count($components); $j < $cc; $j++) {
              $y = $components[$j];
              if ($j === 0) {
                  // element name: '*' or an element known to the HTML
                  // definition (looked up in lowercase)
                  if ($y !== '*') {
                      $y = strtolower($y);
                  }
                  if ($y === '*' || isset($html_definition->info[$y])) {
                      $nx = $y;
                  }
              } elseif ($j % 2) {
                  // set delimiter
                  $sdelim = $y;
              } else {
                  if ($sdelim === '#') {
                      $attrdef = $this->_id_attrdef;
                  } elseif ($sdelim === '.') {
                      $attrdef = $this->_class_attrdef;
                  } elseif ($sdelim === ':') {
                      $attrdef = $this->_enum_attrdef;
                  } else {
                      throw new HTMLPurifier_Exception('broken invariant sdelim and preg_split');
                  }
                  $r = $attrdef->validate($y, $config, $context);
                  if ($r !== false) {
                      // a validator may return true (keep as is) or a
                      // replacement value
                      if ($r !== true) {
                          $y = $r;
                      }
                      if ($nx === null) {
                          $nx = '';
                      }
                      $nx .= $sdelim . $y;
                  }
              }
          }
          return $nx;
      }

      /**
       * Validates the declarations of one ruleset, removing properties
       * that are unknown to the CSS definition or whose value does not
       * validate, and replacing the others by their validated value.
       * @param array $style Map of property name to value
       * @param HTMLPurifier_CSSDefinition $css_definition
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return array Cleaned map of property name to value
       */
      private function cleanDeclarations($style, $css_definition, $config, $context)
      {
          foreach ($style as $name => $value) {
              if (!isset($css_definition->info[$name])) {
                  unset($style[$name]);
                  continue;
              }
              $def = $css_definition->info[$name];
              $ret = $def->validate($value, $config, $context);
              if ($ret === false) {
                  unset($style[$name]);
              } else {
                  $style[$name] = $ret;
              }
          }
          return $style;
      }
  }
 340  
 341  // vim: et sw=4 sts=4