<?php

declare(strict_types=1);

namespace OpenSpout\Common\Helper\Escaper;

/**
 * Escapes and unescapes strings so they can be stored in XLSX documents.
 *
 * Control characters are written as "_xHHHH_" sequences. A literal underscore
 * that would otherwise be read as the start of such a sequence is written as
 * "_x005F_", so "\0" -> "_x0000_" and "_x0000_" -> "_x005F_x0000_".
 *
 * NOTE: the logic has been adapted from the XlsxWriter library (BSD License)
 *
 * @see https://github.com/jmcnamara/XlsxWriter/blob/f1e610f29/xlsxwriter/sharedstrings.py#L89
 *
 * @internal
 */
final class XLSX implements EscaperInterface
{
    /** Escaped form of an underscore that would otherwise introduce an "_xHHHH_" sequence */
    private const ESCAPED_UNDERSCORE = '_x005F_';

    /** @var bool Whether the escaper has already been initialized */
    private bool $isAlreadyInitialized = false;

    /** @var string Regex character class matching the control characters that need to be escaped */
    private string $escapableControlCharactersPattern;

    /** @var string[] Map of escaped value (key) to the control character it stands for (value) */
    private array $controlCharactersEscapingMap;

    /** @var string[] Map of control character (key) to its escaped value (value) */
    private array $controlCharactersEscapingReverseMap;

    /** @var string Regex matching every sequence that unescape() has to decode */
    private string $unescapingPattern;

    /**
     * Escapes the given string to make it compatible with XLSX.
     *
     * @param string $string The string to escape
     *
     * @return string The escaped string
     */
    public function escape(string $string): string
    {
        $this->initIfNeeded();

        $escapedString = $this->escapeControlCharacters($string);

        // @NOTE: Using ENT_QUOTES as XML entities ('<', '>', '&') as well as
        //        single/double quotes (for XML attributes) need to be encoded.
        return htmlspecialchars($escapedString, ENT_QUOTES, 'UTF-8');
    }

    /**
     * Unescapes the given string that was read from an XLSX document.
     *
     * @param string $string The string to unescape
     *
     * @return string The unescaped string
     */
    public function unescape(string $string): string
    {
        $this->initIfNeeded();

        // ==============
        // =   WARNING  =
        // ==============
        // It is assumed that the given string has already had its XML entities decoded.
        // This is true if the string is coming from a DOMNode (as DOMNode already decode XML entities on creation).
        // Therefore there is no need to call "htmlspecialchars_decode()".
        return $this->unescapeControlCharacters($string);
    }

    /**
     * Builds the pattern and the lookup maps on first use.
     */
    private function initIfNeeded(): void
    {
        if ($this->isAlreadyInitialized) {
            return;
        }

        // The map builder reads the pattern, so the pattern has to be set first.
        $this->escapableControlCharactersPattern = $this->getEscapableControlCharactersPattern();
        $this->controlCharactersEscapingMap = $this->getControlCharactersEscapingMap();
        $this->controlCharactersEscapingReverseMap = array_flip($this->controlCharactersEscapingMap);

        // "_x005F_" is only an escaped underscore when it is followed by "xHHHH_";
        // the remaining alternatives are the exact "_xHHHH_" forms of the escapable control characters.
        $this->unescapingPattern = '/'.self::ESCAPED_UNDERSCORE.'(?=x[\dA-F]{4}_)|'
            .implode('|', array_keys($this->controlCharactersEscapingMap)).'/';

        $this->isAlreadyInitialized = true;
    }

    /**
     * @return string Regex character class containing all escapable control characters
     */
    private function getEscapableControlCharactersPattern(): string
    {
        // control characters values are from 0 to 1F (hex values) in the ASCII table
        // some characters should not be escaped though: "\t", "\r" and "\n".
        return '[\x00-\x08'.
            // skipping "\t" (0x9) and "\n" (0xA)
            '\x0B-\x0C'.
            // skipping "\r" (0xD)
            '\x0E-\x1F]';
    }

    /**
     * Builds the map containing the escaped values (key) mapped to the
     * control characters they represent (value).
     * "\t", "\r" and "\n" don't need to be escaped and are therefore absent.
     *
     * @return string[]
     */
    private function getControlCharactersEscapingMap(): array
    {
        $controlCharactersEscapingMap = [];

        // control characters values are from 0 to 1F (hex values) in the ASCII table
        for ($charValue = 0x00; $charValue <= 0x1F; ++$charValue) {
            $character = \chr($charValue);
            if (1 === preg_match("/{$this->escapableControlCharactersPattern}/", $character)) {
                // e.g. 0x0B -> "_x000B_"
                $controlCharactersEscapingMap[sprintf('_x%04X_', $charValue)] = $character;
            }
        }

        return $controlCharactersEscapingMap;
    }

    /**
     * Converts PHP control characters from the given string to OpenXML escaped control characters.
     *
     * Two steps are applied, in this order:
     *  1. every underscore that would start an "_xHHHH_" sequence in the final output
     *     is replaced by "_x005F_";
     *  2. every escapable control character is replaced by its "_xHHHH_" form.
     *
     * @param string $string String to escape
     *
     * @return string The escaped string
     */
    private function escapeControlCharacters(string $string): string
    {
        // The lookahead leaves the closing underscore unconsumed, so back-to-back
        // literals such as "_x0000_x0000_" get both leading underscores escaped.
        // An underscore followed by "xHHHH" and a control character is escaped too,
        // because step 2 turns that control character into text starting with "_".
        $escapeCharacterPattern = '/_(?=x[\dA-F]{4}(?:_|'.$this->escapableControlCharactersPattern.'))/';
        $escapedString = preg_replace($escapeCharacterPattern, self::ESCAPED_UNDERSCORE, $string);

        // Keys are single bytes, so strtr() replaces each control character in one pass
        // and returns the string untouched when there is none.
        return strtr($escapedString, $this->controlCharactersEscapingReverseMap);
    }

    /**
     * Converts OpenXML escaped control characters from the given string to PHP control characters.
     *
     * So "_x0000_" -> "\0" and "_x005F_x0000_" -> "_x0000_".
     *
     * The string is decoded in a single left-to-right pass so that the characters
     * consumed by one sequence are never reused as part of another one.
     *
     * @param string $string String to unescape
     *
     * @return string The unescaped string
     */
    private function unescapeControlCharacters(string $string): string
    {
        return preg_replace_callback(
            $this->unescapingPattern,
            function (array $matches): string {
                $sequence = $matches[0];

                if (self::ESCAPED_UNDERSCORE === $sequence) {
                    // escaped escape character: only the underscore is restored,
                    // the following "xHHHH_" is kept as literal text
                    return '_';
                }

                // the pattern only matches keys of the map, so the lookup always succeeds
                return $this->controlCharactersEscapingMap[$sequence];
            },
            $string
        );
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body