Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.3.x will end 7 October 2024 (12 months).
  • Bug fixes for security issues in 4.3.x will end 21 April 2025 (18 months).
  • PHP version: minimum PHP 8.0.0 Note: minimum PHP version has increased since Moodle 4.1. PHP 8.2.x is supported too.
<?php

declare(strict_types=1);

namespace OpenSpout\Reader\XLSX\Manager;

use DOMElement;
use OpenSpout\Common\Exception\IOException;
use OpenSpout\Reader\Exception\XMLProcessingException;
use OpenSpout\Reader\Wrapper\XMLReader;
< use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
> use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactoryInterface;
use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface; use OpenSpout\Reader\XLSX\Options; /** * @internal */ final class SharedStringsManager { /** * Definition of XML nodes names used to parse data. */ public const XML_NODE_SST = 'sst'; public const XML_NODE_SI = 'si'; public const XML_NODE_R = 'r'; public const XML_NODE_T = 't'; /** * Definition of XML attributes used to parse data. */ public const XML_ATTRIBUTE_COUNT = 'count'; public const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount'; public const XML_ATTRIBUTE_XML_SPACE = 'xml:space'; public const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve'; /** @var string Path of the XLSX file being read */ private string $filePath; private Options $options; /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */ private WorkbookRelationshipsManager $workbookRelationshipsManager;
< /** @var CachingStrategyFactory Factory to create shared strings caching strategies */ < private CachingStrategyFactory $cachingStrategyFactory;
> /** @var CachingStrategyFactoryInterface Factory to create shared strings caching strategies */ > private CachingStrategyFactoryInterface $cachingStrategyFactory;
/** @var CachingStrategyInterface The best caching strategy for storing shared strings */ private CachingStrategyInterface $cachingStrategy; public function __construct( string $filePath, Options $options, WorkbookRelationshipsManager $workbookRelationshipsManager,
< CachingStrategyFactory $cachingStrategyFactory
> CachingStrategyFactoryInterface $cachingStrategyFactory
) { $this->filePath = $filePath; $this->options = $options; $this->workbookRelationshipsManager = $workbookRelationshipsManager; $this->cachingStrategyFactory = $cachingStrategyFactory; } /** * Returns whether the XLSX file contains a shared strings XML file. */ public function hasSharedStrings(): bool { return $this->workbookRelationshipsManager->hasSharedStringsXMLFile(); } /** * Builds an in-memory array containing all the shared strings of the sheet. * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'. * It is then accessed by the sheet data, via the string index in the built table. * * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx * * The XML file can be really big with sheets containing a lot of data. That is why * we need to use a XML reader that provides streaming like the XMLReader library. * * @throws \OpenSpout\Common\Exception\IOException If shared strings XML file can't be read */ public function extractSharedStrings(): void { $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath(); $xmlReader = new XMLReader(); $sharedStringIndex = 0; if (false === $xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath)) { throw new IOException('Could not open "'.$sharedStringsXMLFilePath.'".'); } try { $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader); $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount); $xmlReader->readUntilNodeFound(self::XML_NODE_SI); while (self::XML_NODE_SI === $xmlReader->getCurrentNodeName()) { $this->processSharedStringsItem($xmlReader, $sharedStringIndex); ++$sharedStringIndex; // jump to the next '<si>' tag $xmlReader->next(self::XML_NODE_SI); } $this->cachingStrategy->closeCache(); } catch (XMLProcessingException $exception) { throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]"); } $xmlReader->close(); } /** * Returns the shared string at the given index, using the previously chosen caching strategy. * * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file * * @return string The shared string at the given index * * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index */ public function getStringAtIndex(int $sharedStringIndex): string { return $this->cachingStrategy->getStringAtIndex($sharedStringIndex); } /** * Destroys the cache, freeing memory and removing any created artifacts. */ public function cleanup(): void { if (isset($this->cachingStrategy)) { $this->cachingStrategy->clearCache(); } } /** * Returns the shared strings unique count, as specified in <sst> tag. * * @param XMLReader $xmlReader XMLReader instance * * @return null|int Number of unique shared strings in the sharedStrings.xml file * * @throws \OpenSpout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read */ private function getSharedStringsUniqueCount(XMLReader $xmlReader): ?int { $xmlReader->next(self::XML_NODE_SST); // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE) while (self::XML_NODE_SST === $xmlReader->getCurrentNodeName() && XMLReader::ELEMENT !== $xmlReader->nodeType) { $xmlReader->read(); } $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT); // some software do not add the "uniqueCount" attribute but only use the "count" one // @see https://github.com/box/spout/issues/254 if (null === $uniqueCount) { $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT); } return (null !== $uniqueCount) ? (int) $uniqueCount : null; } /** * Returns the best shared strings caching strategy. * * @param null|int $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown) */ private function getBestSharedStringsCachingStrategy(?int $sharedStringsUniqueCount): CachingStrategyInterface { return $this->cachingStrategyFactory ->createBestCachingStrategy($sharedStringsUniqueCount, $this->options->getTempFolder()) ; } /** * Processes the shared strings item XML node which the given XML reader is positioned on. * * @param XMLReader $xmlReader XML Reader positioned on a "<si>" node * @param int $sharedStringIndex Index of the processed shared strings item */ private function processSharedStringsItem(XMLReader $xmlReader, int $sharedStringIndex): void { $sharedStringValue = ''; // NOTE: expand() will automatically decode all XML entities of the child nodes $siNode = $xmlReader->expand(); \assert($siNode instanceof DOMElement); $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T); foreach ($textNodes as $textNode) { if ($this->shouldExtractTextNodeValue($textNode)) { $textNodeValue = $textNode->nodeValue; \assert(null !== $textNodeValue); $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode); $sharedStringValue .= $shouldPreserveWhitespace ? $textNodeValue
< : trim($textNodeValue) < ;
> : trim($textNodeValue);
} } $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex); } /** * Not all text nodes' values must be extracted. * Some text nodes are part of a node describing the pronunciation for instance. * We'll only consider the nodes whose parents are "<si>" or "<r>". * * @param DOMElement $textNode Text node to check * * @return bool Whether the given text node's value must be extracted */ private function shouldExtractTextNodeValue(DOMElement $textNode): bool { $parentNode = $textNode->parentNode; \assert(null !== $parentNode); $parentTagName = $parentNode->localName; return self::XML_NODE_SI === $parentTagName || self::XML_NODE_R === $parentTagName; } /** * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace. * * @param DOMElement $textNode The text node element (<t>) whose whitespace may be preserved * * @return bool Whether whitespace should be preserved */ private function shouldPreserveWhitespace(DOMElement $textNode): bool { $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE); return self::XML_ATTRIBUTE_VALUE_PRESERVE === $spaceValue; } }