Differences Between: [Versions 402 and 403]
<?php

declare(strict_types=1);

namespace OpenSpout\Reader\XLSX\Manager;

use DOMElement;
use OpenSpout\Common\Exception\IOException;
use OpenSpout\Reader\Exception\XMLProcessingException;
use OpenSpout\Reader\Wrapper\XMLReader;
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
use OpenSpout\Reader\XLSX\Options;

/**
 * Reads the shared strings XML part of an XLSX file and stores every string
 * in a caching strategy, so that it can later be looked up by its index.
 *
 * @internal
 */
final class SharedStringsManager
{
    /**
     * Definition of XML nodes names used to parse data.
     */
    public const XML_NODE_SST = 'sst';
    public const XML_NODE_SI = 'si';
    public const XML_NODE_R = 'r';
    public const XML_NODE_T = 't';

    /**
     * Definition of XML attributes used to parse data.
     */
    public const XML_ATTRIBUTE_COUNT = 'count';
    public const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    public const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    public const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /** @var string Path of the XLSX file being read */
    private string $filePath;

    /** @var Options Reader options; provides the temp folder handed to the caching strategy factory */
    private Options $options;

    /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */
    private WorkbookRelationshipsManager $workbookRelationshipsManager;

    /** @var CachingStrategyFactory Factory to create shared strings caching strategies */
    private CachingStrategyFactory $cachingStrategyFactory;

    /**
     * @var CachingStrategyInterface The best caching strategy for storing shared strings.
     *                               Only initialized once extractSharedStrings() has run.
     */
    private CachingStrategyInterface $cachingStrategy;

    public function __construct(
        string $filePath,
        Options $options,
        WorkbookRelationshipsManager $workbookRelationshipsManager,
        CachingStrategyFactory $cachingStrategyFactory
    ) {
        $this->filePath = $filePath;
        $this->options = $options;
        $this->workbookRelationshipsManager = $workbookRelationshipsManager;
        $this->cachingStrategyFactory = $cachingStrategyFactory;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file.
     */
    public function hasSharedStrings(): bool
    {
        return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
    }

    /**
     * Reads all the shared strings of the workbook and stores them, by index,
     * in the caching strategy chosen for the announced number of strings.
     * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
     * It is then accessed by the sheet data, via the string index in the built table.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     *
     * @throws \OpenSpout\Common\Exception\IOException If shared strings XML file can't be read
     */
    public function extractSharedStrings(): void
    {
        $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
        $xmlReader = new XMLReader();
        $sharedStringIndex = 0;

        if (false === $xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath)) {
            throw new IOException('Could not open "'.$sharedStringsXMLFilePath.'".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            while (self::XML_NODE_SI === $xmlReader->getCurrentNodeName()) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                ++$sharedStringIndex;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();
        } catch (XMLProcessingException $exception) {
            // Keep the original exception attached so the root cause is not lost.
            throw new IOException(
                "The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]",
                0,
                $exception
            );
        } finally {
            // Release the reader whether parsing succeeded or failed.
            $xmlReader->close();
        }
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     *
     * @return string The shared string at the given index
     *
     * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex(int $sharedStringIndex): string
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts.
     * Does nothing when no caching strategy has been created yet.
     */
    public function cleanup(): void
    {
        if (isset($this->cachingStrategy)) {
            $this->cachingStrategy->clearCache();
        }
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     *
     * @param XMLReader $xmlReader XMLReader instance
     *
     * @return null|int Number of unique shared strings in the sharedStrings.xml file
     *                  (NULL if neither "uniqueCount" nor "count" is present)
     *
     * @throws \OpenSpout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
     */
    private function getSharedStringsUniqueCount(XMLReader $xmlReader): ?int
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while (self::XML_NODE_SST === $xmlReader->getCurrentNodeName() && XMLReader::ELEMENT !== $xmlReader->nodeType) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if (null === $uniqueCount) {
            $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
        }

        return (null !== $uniqueCount) ? (int) $uniqueCount : null;
    }

    /**
     * Returns the best shared strings caching strategy.
     *
     * @param null|int $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     */
    private function getBestSharedStringsCachingStrategy(?int $sharedStringsUniqueCount): CachingStrategyInterface
    {
        return $this->cachingStrategyFactory
            ->createBestCachingStrategy($sharedStringsUniqueCount, $this->options->getTempFolder())
        ;
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on.
     * The values of all eligible "<t>" descendants are concatenated and stored under the given index.
     *
     * @param XMLReader $xmlReader         XML Reader positioned on a "<si>" node
     * @param int       $sharedStringIndex Index of the processed shared strings item
     */
    private function processSharedStringsItem(XMLReader $xmlReader, int $sharedStringIndex): void
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();
        \assert($siNode instanceof DOMElement);
        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if ($this->shouldExtractTextNodeValue($textNode)) {
                $textNodeValue = $textNode->nodeValue;
                \assert(null !== $textNodeValue);
                $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);

                // Surrounding whitespace is only kept when the node explicitly asks for it
                $sharedStringValue .= $shouldPreserveWhitespace
                    ? $textNodeValue
                    : trim($textNodeValue)
                ;
            }
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param DOMElement $textNode Text node to check
     *
     * @return bool Whether the given text node's value must be extracted
     */
    private function shouldExtractTextNodeValue(DOMElement $textNode): bool
    {
        $parentNode = $textNode->parentNode;
        \assert(null !== $parentNode);
        $parentTagName = $parentNode->localName;

        return self::XML_NODE_SI === $parentTagName || self::XML_NODE_R === $parentTagName;
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     *
     * @return bool Whether whitespace should be preserved
     */
    private function shouldPreserveWhitespace(DOMElement $textNode): bool
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);

        return self::XML_ATTRIBUTE_VALUE_PRESERVE === $spaceValue;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body