Differences Between: [Versions 310 and 400] [Versions 39 and 400]
<?php

namespace Box\Spout\Reader\XLSX\Manager;

use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\Exception\XMLProcessingException;
use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Creator\HelperFactory;
use Box\Spout\Reader\XLSX\Creator\InternalEntityFactory;
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;

/**
 * Class SharedStringsManager
 * This class manages the shared strings defined in the associated XML file.
 *
 * Typical usage: check hasSharedStrings(), call extractSharedStrings() once to
 * populate the cache, look strings up with getStringAtIndex(), and finally
 * call cleanup() to release whatever the caching strategy allocated.
 */
class SharedStringsManager
{
    /** Definition of XML nodes names used to parse data */
    const XML_NODE_SST = 'sst';
    const XML_NODE_SI = 'si';
    const XML_NODE_R = 'r';
    const XML_NODE_T = 't';

    /** Definition of XML attributes used to parse data */
    const XML_ATTRIBUTE_COUNT = 'count';
    const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var string Temporary folder where the temporary files to store shared strings will be stored */
    protected $tempFolder;

    /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */
    protected $workbookRelationshipsManager;

    /** @var InternalEntityFactory Factory to create entities */
    protected $entityFactory;

    /** @var HelperFactory Factory to create helpers */
    protected $helperFactory;

    /** @var CachingStrategyFactory Factory to create shared strings caching strategies */
    protected $cachingStrategyFactory;

    /** @var CachingStrategyInterface|null The caching strategy storing shared strings (NULL until extractSharedStrings() has selected one) */
    protected $cachingStrategy;

    /**
     * @param string $filePath Path of the XLSX file being read
     * @param string $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @param WorkbookRelationshipsManager $workbookRelationshipsManager Helps retrieving workbook relationships
     * @param InternalEntityFactory $entityFactory Factory to create entities
     * @param HelperFactory $helperFactory Factory to create helpers
     * @param CachingStrategyFactory $cachingStrategyFactory Factory to create shared strings caching strategies
     */
    public function __construct(
        $filePath,
        $tempFolder,
        $workbookRelationshipsManager,
        $entityFactory,
        $helperFactory,
        $cachingStrategyFactory
    ) {
        $this->filePath = $filePath;
        $this->tempFolder = $tempFolder;
        $this->workbookRelationshipsManager = $workbookRelationshipsManager;
        $this->entityFactory = $entityFactory;
        $this->helperFactory = $helperFactory;
        $this->cachingStrategyFactory = $cachingStrategyFactory;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
    }

    /**
     * Reads every shared string of the workbook and stores it through the caching strategy
     * selected from the declared number of unique strings.
     * The strings live in a XML file inside the XLSX archive (its path is provided by the
     * workbook relationships manager) and are later accessed by the sheet data via their index.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     *
     * @throws \Box\Spout\Common\Exception\IOException If shared strings XML file can't be opened or is invalid
     * @return void
     */
    public function extractSharedStrings()
    {
        $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
        $xmlReader = $this->entityFactory->createXMLReader();
        $sharedStringIndex = 0;

        if ($xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath) === false) {
            throw new IOException('Could not open "' . $sharedStringsXMLFilePath . '".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SI) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                $sharedStringIndex++;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();
        } catch (XMLProcessingException $exception) {
            // Keep the original exception attached so the root cause stays visible in traces.
            throw new IOException(
                "The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]",
                0,
                $exception
            );
        } finally {
            // Always release the reader, including when parsing failed half-way through.
            $xmlReader->close();
        }
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     * Falls back to the "count" attribute when "uniqueCount" is absent.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException Presumably raised by the reader on malformed XML (the caller catches this type)
     * @return int|null Number of unique shared strings in the sharedStrings.xml file (NULL if neither attribute is present)
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if ($uniqueCount === null) {
            $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
        }

        return ($uniqueCount !== null) ? (int) $uniqueCount : null;
    }

    /**
     * Returns the best shared strings caching strategy, as decided by the caching strategy factory.
     *
     * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     * @return CachingStrategyInterface
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        return $this->cachingStrategyFactory
            ->createBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder, $this->helperFactory);
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on.
     * The values of all relevant "<t>" descendants are concatenated and stored at the given index.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node
     * @param int $sharedStringIndex Index of the processed shared strings item
     * @throws \Box\Spout\Reader\Exception\XMLProcessingException If the "<si>" node cannot be expanded
     * @return void
     */
    protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();

        // expand() reports failure by returning false; surface it as a processing error
        // instead of letting the next call fail with "member function on bool".
        if ($siNode === false) {
            throw new XMLProcessingException(
                'Could not expand the shared strings item at index ' . $sharedStringIndex . '.'
            );
        }

        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if ($this->shouldExtractTextNodeValue($textNode)) {
                $textNodeValue = $textNode->nodeValue;
                $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);

                $sharedStringValue .= ($shouldPreserveWhitespace) ? $textNodeValue : \trim($textNodeValue);
            }
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param \DOMElement $textNode Text node to check
     * @return bool Whether the given text node's value must be extracted
     */
    protected function shouldExtractTextNodeValue($textNode)
    {
        $parentTagName = $textNode->parentNode->localName;

        return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R);
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);

        return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE);
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     * extractSharedStrings() must have been called beforehand, as it is what selects the strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     * @return string The shared string at the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts.
     * Does nothing when no caching strategy has been selected yet.
     *
     * @return void
     */
    public function cleanup()
    {
        if ($this->cachingStrategy !== null) {
            $this->cachingStrategy->clearCache();
        }
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body