Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.2.x will end 22 April 2024 (12 months).
  • Bug fixes for security issues in 4.2.x will end 7 October 2024 (18 months).
  • PHP version: minimum PHP 8.0.0. Note: the minimum PHP version has increased since Moodle 4.1. PHP 8.1.x is also supported.
   1  <?php
   2  
   3  declare(strict_types=1);
   4  
   5  namespace OpenSpout\Common\Helper\Escaper;
   6  
   7  /**
   8   * @internal
   9   */
  10  final class XLSX implements EscaperInterface
  11  {
  12      /** @var bool Whether the escaper has already been initialized */
  13      private bool $isAlreadyInitialized = false;
  14  
  15      /** @var string Regex pattern to detect control characters that need to be escaped */
  16      private string $escapableControlCharactersPattern;
  17  
  18      /** @var string[] Map containing control characters to be escaped (key) and their escaped value (value) */
  19      private array $controlCharactersEscapingMap;
  20  
  21      /** @var string[] Map containing control characters to be escaped (value) and their escaped value (key) */
  22      private array $controlCharactersEscapingReverseMap;
  23  
  24      /**
  25       * Escapes the given string to make it compatible with XLSX.
  26       *
  27       * @param string $string The string to escape
  28       *
  29       * @return string The escaped string
  30       */
  31      public function escape(string $string): string
  32      {
  33          $this->initIfNeeded();
  34  
  35          $escapedString = $this->escapeControlCharacters($string);
  36          // @NOTE: Using ENT_QUOTES as XML entities ('<', '>', '&') as well as
  37          //        single/double quotes (for XML attributes) need to be encoded.
  38          return htmlspecialchars($escapedString, ENT_QUOTES, 'UTF-8');
  39      }
  40  
  41      /**
  42       * Unescapes the given string to make it compatible with XLSX.
  43       *
  44       * @param string $string The string to unescape
  45       *
  46       * @return string The unescaped string
  47       */
  48      public function unescape(string $string): string
  49      {
  50          $this->initIfNeeded();
  51  
  52          // ==============
  53          // =   WARNING  =
  54          // ==============
  55          // It is assumed that the given string has already had its XML entities decoded.
  56          // This is true if the string is coming from a DOMNode (as DOMNode already decode XML entities on creation).
  57          // Therefore there is no need to call "htmlspecialchars_decode()".
  58          return $this->unescapeControlCharacters($string);
  59      }
  60  
  61      /**
  62       * Initializes the control characters if not already done.
  63       */
  64      private function initIfNeeded(): void
  65      {
  66          if (!$this->isAlreadyInitialized) {
  67              $this->escapableControlCharactersPattern = $this->getEscapableControlCharactersPattern();
  68              $this->controlCharactersEscapingMap = $this->getControlCharactersEscapingMap();
  69              $this->controlCharactersEscapingReverseMap = array_flip($this->controlCharactersEscapingMap);
  70  
  71              $this->isAlreadyInitialized = true;
  72          }
  73      }
  74  
  75      /**
  76       * @return string Regex pattern containing all escapable control characters
  77       */
  78      private function getEscapableControlCharactersPattern(): string
  79      {
  80          // control characters values are from 0 to 1F (hex values) in the ASCII table
  81          // some characters should not be escaped though: "\t", "\r" and "\n".
  82          return '[\x00-\x08'.
  83                  // skipping "\t" (0x9) and "\n" (0xA)
  84                  '\x0B-\x0C'.
  85                  // skipping "\r" (0xD)
  86                  '\x0E-\x1F]';
  87      }
  88  
  89      /**
  90       * Builds the map containing control characters to be escaped
  91       * mapped to their escaped values.
  92       * "\t", "\r" and "\n" don't need to be escaped.
  93       *
  94       * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
  95       *
  96       * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
  97       *
  98       * @return string[]
  99       */
 100      private function getControlCharactersEscapingMap(): array
 101      {
 102          $controlCharactersEscapingMap = [];
 103  
 104          // control characters values are from 0 to 1F (hex values) in the ASCII table
 105          for ($charValue = 0x00; $charValue <= 0x1F; ++$charValue) {
 106              $character = \chr($charValue);
 107              if (1 === preg_match("/{$this->escapableControlCharactersPattern}/", $character)) {
 108                  $charHexValue = dechex($charValue);
 109                  $escapedChar = '_x'.sprintf('%04s', strtoupper($charHexValue)).'_';
 110                  $controlCharactersEscapingMap[$escapedChar] = $character;
 111              }
 112          }
 113  
 114          return $controlCharactersEscapingMap;
 115      }
 116  
 117      /**
 118       * Converts PHP control characters from the given string to OpenXML escaped control characters.
 119       *
 120       * Excel escapes control characters with _xHHHH_ and also escapes any
 121       * literal strings of that type by encoding the leading underscore.
 122       * So "\0" -> _x0000_ and "_x0000_" -> _x005F_x0000_.
 123       *
 124       * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
 125       *
 126       * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
 127       *
 128       * @param string $string String to escape
 129       */
 130      private function escapeControlCharacters(string $string): string
 131      {
 132          $escapedString = $this->escapeEscapeCharacter($string);
 133  
 134          // if no control characters
 135          if (1 !== preg_match("/{$this->escapableControlCharactersPattern}/", $escapedString)) {
 136              return $escapedString;
 137          }
 138  
 139          return preg_replace_callback("/({$this->escapableControlCharactersPattern})/", function ($matches) {
 140              return $this->controlCharactersEscapingReverseMap[$matches[0]];
 141          }, $escapedString);
 142      }
 143  
 144      /**
 145       * Escapes the escape character: "_x0000_" -> "_x005F_x0000_".
 146       *
 147       * @param string $string String to escape
 148       *
 149       * @return string The escaped string
 150       */
 151      private function escapeEscapeCharacter(string $string): string
 152      {
 153          return preg_replace('/_(x[\dA-F]{4})_/', '_x005F_$1_', $string);
 154      }
 155  
 156      /**
 157       * Converts OpenXML escaped control characters from the given string to PHP control characters.
 158       *
 159       * Excel escapes control characters with _xHHHH_ and also escapes any
 160       * literal strings of that type by encoding the leading underscore.
 161       * So "_x0000_" -> "\0" and "_x005F_x0000_" -> "_x0000_"
 162       *
 163       * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
 164       *
 165       * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
 166       *
 167       * @param string $string String to unescape
 168       */
 169      private function unescapeControlCharacters(string $string): string
 170      {
 171          $unescapedString = $string;
 172  
 173          foreach ($this->controlCharactersEscapingMap as $escapedCharValue => $charValue) {
 174              // only unescape characters that don't contain the escaped escape character for now
 175              $unescapedString = preg_replace("/(?<!_x005F)({$escapedCharValue})/", $charValue, $unescapedString);
 176          }
 177  
 178          return $this->unescapeEscapeCharacter($unescapedString);
 179      }
 180  
 181      /**
 182       * Unecapes the escape character: "_x005F_x0000_" => "_x0000_".
 183       *
 184       * @param string $string String to unescape
 185       *
 186       * @return string The unescaped string
 187       */
 188      private function unescapeEscapeCharacter(string $string): string
 189      {
 190          return preg_replace('/_x005F(_x[\dA-F]{4}_)/', '$1', $string);
 191      }
 192  }