See Release Notes
Long Term Support Release
Differences Between: [Versions 39 and 311] [Versions 39 and 400] [Versions 39 and 401]
<?php

namespace Box\Spout\Reader\XLSX\Manager;

use Box\Spout\Common\Exception\IOException;
use Box\Spout\Reader\Exception\XMLProcessingException;
use Box\Spout\Reader\Wrapper\XMLReader;
use Box\Spout\Reader\XLSX\Creator\HelperFactory;
use Box\Spout\Reader\XLSX\Creator\InternalEntityFactory;
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;

/**
 * Class SharedStringsManager
 * This class manages the shared strings defined in the associated XML file.
 *
 * Typical usage: call extractSharedStrings() once to load every "<si>" item into the
 * caching strategy, then getStringAtIndex() to resolve indexes, and finally cleanup().
 */
class SharedStringsManager
{
    /** Main namespace for the sharedStrings.xml file */
    const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';

    /** Definition of XML nodes names used to parse data */
    const XML_NODE_SST = 'sst';
    const XML_NODE_SI = 'si';
    const XML_NODE_R = 'r';
    const XML_NODE_T = 't';

    /** Definition of XML attributes used to parse data */
    const XML_ATTRIBUTE_COUNT = 'count';
    const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /** @var string Path of the XLSX file being read */
    protected $filePath;

    /** @var string Temporary folder where the temporary files to store shared strings will be stored */
    protected $tempFolder;

    /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */
    protected $workbookRelationshipsManager;

    /** @var InternalEntityFactory Factory to create entities */
    protected $entityFactory;

    /** @var HelperFactory $helperFactory Factory to create helpers */
    protected $helperFactory;

    /** @var CachingStrategyFactory Factory to create shared strings caching strategies */
    protected $cachingStrategyFactory;

    /** @var CachingStrategyInterface|null The caching strategy chosen by extractSharedStrings() (NULL until then) */
    protected $cachingStrategy;

    /**
     * @param string $filePath Path of the XLSX file being read
     * @param string $tempFolder Temporary folder where the temporary files to store shared strings will be stored
     * @param WorkbookRelationshipsManager $workbookRelationshipsManager Helps retrieving workbook relationships
     * @param InternalEntityFactory $entityFactory Factory to create entities
     * @param HelperFactory $helperFactory Factory to create helpers
     * @param CachingStrategyFactory $cachingStrategyFactory Factory to create shared strings caching strategies
     */
    public function __construct(
        $filePath,
        $tempFolder,
        $workbookRelationshipsManager,
        $entityFactory,
        $helperFactory,
        $cachingStrategyFactory
    ) {
        $this->filePath = $filePath;
        $this->tempFolder = $tempFolder;
        $this->workbookRelationshipsManager = $workbookRelationshipsManager;
        $this->entityFactory = $entityFactory;
        $this->helperFactory = $helperFactory;
        $this->cachingStrategyFactory = $cachingStrategyFactory;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file
     *
     * @return bool
     */
    public function hasSharedStrings()
    {
        return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
    }

    /**
     * Reads every "<si>" item of the shared strings XML file and stores its text in the
     * caching strategy, keyed by its zero-based position in the file.
     * The sheet data later refers to these strings by that index (see getStringAtIndex()).
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * the file is walked node by node with a streaming reader instead of being loaded at once.
     *
     * The reader is always closed once it has been opened, whether parsing succeeds or fails.
     *
     * @throws \Box\Spout\Common\Exception\IOException If shared strings XML file can't be read
     * @return void
     */
    public function extractSharedStrings()
    {
        $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
        $xmlReader = $this->entityFactory->createXMLReader();
        $sharedStringIndex = 0;

        if ($xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath) === false) {
            throw new IOException('Could not open "' . $sharedStringsXMLFilePath . '".');
        }

        try {
            // The strategy is picked from the declared number of strings, before any item is read
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SI) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                $sharedStringIndex++;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();
        } catch (XMLProcessingException $exception) {
            // Keep the original exception as "previous" so its stack trace is not lost
            throw new IOException(
                "The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]",
                0,
                $exception
            );
        } finally {
            // Runs on success and on any exception, so the opened reader is never left dangling
            $xmlReader->close();
        }
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     * Falls back to the "count" attribute when "uniqueCount" is absent.
     *
     * NOTE(review): any exception raised by the reader while moving to the "<sst>" node is
     * not handled here; extractSharedStrings() converts XMLProcessingException to IOException.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
     * @return int|null Number of unique shared strings in the sharedStrings.xml file (NULL if not declared)
     */
    protected function getSharedStringsUniqueCount($xmlReader)
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if ($uniqueCount === null) {
            $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
        }

        return ($uniqueCount !== null) ? (int) $uniqueCount : null;
    }

    /**
     * Returns the best shared strings caching strategy, as decided by the caching strategy factory.
     *
     * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     * @return CachingStrategyInterface
     */
    protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
    {
        return $this->cachingStrategyFactory
            ->createBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder, $this->helperFactory);
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on.
     * The values of all relevant "<t>" descendants are concatenated, in document order, into a
     * single string that is stored in the caching strategy under the given index.
     *
     * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node
     * @param int $sharedStringIndex Index of the processed shared strings item
     * @return void
     */
    protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();
        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if ($this->shouldExtractTextNodeValue($textNode)) {
                $textNodeValue = $textNode->nodeValue;
                $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);

                // Surrounding whitespace is only kept when the node explicitly asks for it
                $sharedStringValue .= ($shouldPreserveWhitespace) ? $textNodeValue : trim($textNodeValue);
            }
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param \DOMElement $textNode Text node to check
     * @return bool Whether the given text node's value must be extracted
     */
    protected function shouldExtractTextNodeValue($textNode)
    {
        $parentTagName = $textNode->parentNode->localName;

        return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R);
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     * @return bool Whether whitespace should be preserved
     */
    protected function shouldPreserveWhitespace($textNode)
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);

        return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE);
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     * extractSharedStrings() must have been called first, as it is what selects the strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     * @return string The shared string at the given index
     */
    public function getStringAtIndex($sharedStringIndex)
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts.
     * Does nothing when no caching strategy has been selected yet.
     *
     * @return void
     */
    public function cleanup()
    {
        if ($this->cachingStrategy !== null) {
            $this->cachingStrategy->clearCache();
        }
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body