<?php
declare(strict_types=1);
namespace OpenSpout\Reader\XLSX\Manager;
use DOMElement;
use OpenSpout\Common\Exception\IOException;
use OpenSpout\Reader\Exception\XMLProcessingException;
use OpenSpout\Reader\Wrapper\XMLReader;
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactoryInterface;
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
use OpenSpout\Reader\XLSX\Options;
/**
 * Reads the shared strings part of an XLSX file, stores every string in a
 * caching strategy and serves the strings back by index.
 *
 * @internal
 */
final class SharedStringsManager
{
    /**
     * Definition of XML nodes names used to parse data.
     */
    public const XML_NODE_SST = 'sst';
    public const XML_NODE_SI = 'si';
    public const XML_NODE_R = 'r';
    public const XML_NODE_T = 't';

    /**
     * Definition of XML attributes used to parse data.
     */
    public const XML_ATTRIBUTE_COUNT = 'count';
    public const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
    public const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
    public const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';

    /** @var string Path of the XLSX file being read */
    private string $filePath;

    /** @var Options Reader options; only the temp folder is read by this class */
    private Options $options;

    /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */
    private WorkbookRelationshipsManager $workbookRelationshipsManager;

    /** @var CachingStrategyFactoryInterface Factory to create shared strings caching strategies */
    private CachingStrategyFactoryInterface $cachingStrategyFactory;

    /**
     * @var CachingStrategyInterface The caching strategy chosen for storing shared strings.
     *                               Stays uninitialized until extractSharedStrings() has run.
     */
    private CachingStrategyInterface $cachingStrategy;

    /**
     * @param string                          $filePath                     Path of the XLSX file being read
     * @param Options                         $options                      Reader options (source of the temp folder)
     * @param WorkbookRelationshipsManager    $workbookRelationshipsManager Locates the shared strings XML file
     * @param CachingStrategyFactoryInterface $cachingStrategyFactory       Creates the caching strategy
     */
    public function __construct(
        string $filePath,
        Options $options,
        WorkbookRelationshipsManager $workbookRelationshipsManager,
        CachingStrategyFactoryInterface $cachingStrategyFactory
    ) {
        $this->filePath = $filePath;
        $this->options = $options;
        $this->workbookRelationshipsManager = $workbookRelationshipsManager;
        $this->cachingStrategyFactory = $cachingStrategyFactory;
    }

    /**
     * Returns whether the XLSX file contains a shared strings XML file.
     */
    public function hasSharedStrings(): bool
    {
        return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
    }

    /**
     * Reads all the shared strings of the workbook and stores them in the chosen caching strategy.
     * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
     * It is then accessed by the sheet data, via the string index in the built table.
     *
     * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
     *
     * The XML file can be really big with sheets containing a lot of data. That is why
     * we need to use a XML reader that provides streaming like the XMLReader library.
     *
     * @throws \OpenSpout\Common\Exception\IOException If shared strings XML file can't be read
     */
    public function extractSharedStrings(): void
    {
        $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
        $xmlReader = new XMLReader();

        if (false === $xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath)) {
            throw new IOException('Could not open "'.$sharedStringsXMLFilePath.'".');
        }

        try {
            $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
            $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);

            $xmlReader->readUntilNodeFound(self::XML_NODE_SI);

            $sharedStringIndex = 0;
            while (self::XML_NODE_SI === $xmlReader->getCurrentNodeName()) {
                $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
                ++$sharedStringIndex;

                // jump to the next '<si>' tag
                $xmlReader->next(self::XML_NODE_SI);
            }

            $this->cachingStrategy->closeCache();
        } catch (XMLProcessingException $exception) {
            // Keep the original exception as "previous" so its stack trace is not lost.
            throw new IOException(
                "The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]",
                0,
                $exception
            );
        } finally {
            // Release the reader on every path, including when parsing failed.
            $xmlReader->close();
        }
    }

    /**
     * Returns the shared string at the given index, using the previously chosen caching strategy.
     * extractSharedStrings() must have been called first, as it is what sets the caching strategy.
     *
     * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
     *
     * @return string The shared string at the given index
     *
     * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
     */
    public function getStringAtIndex(int $sharedStringIndex): string
    {
        return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
    }

    /**
     * Destroys the cache, freeing memory and removing any created artifacts.
     * Does nothing when no caching strategy has been created yet.
     */
    public function cleanup(): void
    {
        if (isset($this->cachingStrategy)) {
            $this->cachingStrategy->clearCache();
        }
    }

    /**
     * Returns the shared strings unique count, as specified in <sst> tag.
     * Falls back to the "count" attribute when "uniqueCount" is absent.
     *
     * @param XMLReader $xmlReader XMLReader instance
     *
     * @return null|int Number of unique shared strings in the sharedStrings.xml file,
     *                  or NULL when neither attribute is present
     */
    private function getSharedStringsUniqueCount(XMLReader $xmlReader): ?int
    {
        $xmlReader->next(self::XML_NODE_SST);

        // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
        while (self::XML_NODE_SST === $xmlReader->getCurrentNodeName() && XMLReader::ELEMENT !== $xmlReader->nodeType) {
            $xmlReader->read();
        }

        $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);

        // some software do not add the "uniqueCount" attribute but only use the "count" one
        // @see https://github.com/box/spout/issues/254
        if (null === $uniqueCount) {
            $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
        }

        return (null !== $uniqueCount) ? (int) $uniqueCount : null;
    }

    /**
     * Returns the best shared strings caching strategy, as decided by the injected factory
     * from the unique count and the configured temp folder.
     *
     * @param null|int $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
     */
    private function getBestSharedStringsCachingStrategy(?int $sharedStringsUniqueCount): CachingStrategyInterface
    {
        return $this->cachingStrategyFactory
            ->createBestCachingStrategy($sharedStringsUniqueCount, $this->options->getTempFolder())
        ;
    }

    /**
     * Processes the shared strings item XML node which the given XML reader is positioned on.
     * The values of all eligible "<t>" descendants are concatenated and stored under the given index.
     *
     * @param XMLReader $xmlReader         XML Reader positioned on a "<si>" node
     * @param int       $sharedStringIndex Index of the processed shared strings item
     */
    private function processSharedStringsItem(XMLReader $xmlReader, int $sharedStringIndex): void
    {
        $sharedStringValue = '';

        // NOTE: expand() will automatically decode all XML entities of the child nodes
        $siNode = $xmlReader->expand();
        \assert($siNode instanceof DOMElement);

        $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);

        foreach ($textNodes as $textNode) {
            if ($this->shouldExtractTextNodeValue($textNode)) {
                $textNodeValue = $textNode->nodeValue;
                \assert(null !== $textNodeValue);

                // Whitespace is only kept verbatim when the node asks for it explicitly.
                $sharedStringValue .= $this->shouldPreserveWhitespace($textNode)
                    ? $textNodeValue
                    : trim($textNodeValue);
            }
        }

        $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
    }

    /**
     * Not all text nodes' values must be extracted.
     * Some text nodes are part of a node describing the pronunciation for instance.
     * We'll only consider the nodes whose parents are "<si>" or "<r>".
     *
     * @param DOMElement $textNode Text node to check
     *
     * @return bool Whether the given text node's value must be extracted
     */
    private function shouldExtractTextNodeValue(DOMElement $textNode): bool
    {
        $parentNode = $textNode->parentNode;
        \assert(null !== $parentNode);

        $parentTagName = $parentNode->localName;

        return self::XML_NODE_SI === $parentTagName || self::XML_NODE_R === $parentTagName;
    }

    /**
     * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
     *
     * @param DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
     *
     * @return bool Whether whitespace should be preserved
     */
    private function shouldPreserveWhitespace(DOMElement $textNode): bool
    {
        $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);

        return self::XML_ATTRIBUTE_VALUE_PRESERVE === $spaceValue;
    }
}