Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.2.x will end 22 April 2024 (12 months).
  • Bug fixes for security issues in 4.2.x will end 7 October 2024 (18 months).
  • PHP version: minimum PHP 8.0.0. Note: the minimum PHP version has increased since Moodle 4.1. PHP 8.1.x is also supported.

Differences Between: [Versions 402 and 403]

   1  <?php
   2  
   3  declare(strict_types=1);
   4  
   5  namespace OpenSpout\Reader\XLSX\Manager;
   6  
   7  use DOMElement;
   8  use OpenSpout\Common\Exception\IOException;
   9  use OpenSpout\Reader\Exception\XMLProcessingException;
  10  use OpenSpout\Reader\Wrapper\XMLReader;
  11  use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
  12  use OpenSpout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
  13  use OpenSpout\Reader\XLSX\Options;
  14  
  15  /**
  16   * @internal
  17   */
  18  final class SharedStringsManager
  19  {
  20      /**
  21       * Definition of XML nodes names used to parse data.
  22       */
  23      public const XML_NODE_SST = 'sst';
  24      public const XML_NODE_SI = 'si';
  25      public const XML_NODE_R = 'r';
  26      public const XML_NODE_T = 't';
  27  
  28      /**
  29       * Definition of XML attributes used to parse data.
  30       */
  31      public const XML_ATTRIBUTE_COUNT = 'count';
  32      public const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
  33      public const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
  34      public const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';
  35  
  36      /** @var string Path of the XLSX file being read */
  37      private string $filePath;
  38  
  39      private Options $options;
  40  
  41      /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */
  42      private WorkbookRelationshipsManager $workbookRelationshipsManager;
  43  
  44      /** @var CachingStrategyFactory Factory to create shared strings caching strategies */
  45      private CachingStrategyFactory $cachingStrategyFactory;
  46  
  47      /** @var CachingStrategyInterface The best caching strategy for storing shared strings */
  48      private CachingStrategyInterface $cachingStrategy;
  49  
  50      public function __construct(
  51          string $filePath,
  52          Options $options,
  53          WorkbookRelationshipsManager $workbookRelationshipsManager,
  54          CachingStrategyFactory $cachingStrategyFactory
  55      ) {
  56          $this->filePath = $filePath;
  57          $this->options = $options;
  58          $this->workbookRelationshipsManager = $workbookRelationshipsManager;
  59          $this->cachingStrategyFactory = $cachingStrategyFactory;
  60      }
  61  
  62      /**
  63       * Returns whether the XLSX file contains a shared strings XML file.
  64       */
  65      public function hasSharedStrings(): bool
  66      {
  67          return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
  68      }
  69  
  70      /**
  71       * Builds an in-memory array containing all the shared strings of the sheet.
  72       * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
  73       * It is then accessed by the sheet data, via the string index in the built table.
  74       *
  75       * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
  76       *
  77       * The XML file can be really big with sheets containing a lot of data. That is why
  78       * we need to use a XML reader that provides streaming like the XMLReader library.
  79       *
  80       * @throws \OpenSpout\Common\Exception\IOException If shared strings XML file can't be read
  81       */
  82      public function extractSharedStrings(): void
  83      {
  84          $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
  85          $xmlReader = new XMLReader();
  86          $sharedStringIndex = 0;
  87  
  88          if (false === $xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath)) {
  89              throw new IOException('Could not open "'.$sharedStringsXMLFilePath.'".');
  90          }
  91  
  92          try {
  93              $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
  94              $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
  95  
  96              $xmlReader->readUntilNodeFound(self::XML_NODE_SI);
  97  
  98              while (self::XML_NODE_SI === $xmlReader->getCurrentNodeName()) {
  99                  $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
 100                  ++$sharedStringIndex;
 101  
 102                  // jump to the next '<si>' tag
 103                  $xmlReader->next(self::XML_NODE_SI);
 104              }
 105  
 106              $this->cachingStrategy->closeCache();
 107          } catch (XMLProcessingException $exception) {
 108              throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
 109          }
 110  
 111          $xmlReader->close();
 112      }
 113  
 114      /**
 115       * Returns the shared string at the given index, using the previously chosen caching strategy.
 116       *
 117       * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
 118       *
 119       * @return string The shared string at the given index
 120       *
 121       * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
 122       */
 123      public function getStringAtIndex(int $sharedStringIndex): string
 124      {
 125          return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
 126      }
 127  
 128      /**
 129       * Destroys the cache, freeing memory and removing any created artifacts.
 130       */
 131      public function cleanup(): void
 132      {
 133          if (isset($this->cachingStrategy)) {
 134              $this->cachingStrategy->clearCache();
 135          }
 136      }
 137  
 138      /**
 139       * Returns the shared strings unique count, as specified in <sst> tag.
 140       *
 141       * @param XMLReader $xmlReader XMLReader instance
 142       *
 143       * @return null|int Number of unique shared strings in the sharedStrings.xml file
 144       *
 145       * @throws \OpenSpout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
 146       */
 147      private function getSharedStringsUniqueCount(XMLReader $xmlReader): ?int
 148      {
 149          $xmlReader->next(self::XML_NODE_SST);
 150  
 151          // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
 152          while (self::XML_NODE_SST === $xmlReader->getCurrentNodeName() && XMLReader::ELEMENT !== $xmlReader->nodeType) {
 153              $xmlReader->read();
 154          }
 155  
 156          $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);
 157  
 158          // some software do not add the "uniqueCount" attribute but only use the "count" one
 159          // @see https://github.com/box/spout/issues/254
 160          if (null === $uniqueCount) {
 161              $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
 162          }
 163  
 164          return (null !== $uniqueCount) ? (int) $uniqueCount : null;
 165      }
 166  
 167      /**
 168       * Returns the best shared strings caching strategy.
 169       *
 170       * @param null|int $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
 171       */
 172      private function getBestSharedStringsCachingStrategy(?int $sharedStringsUniqueCount): CachingStrategyInterface
 173      {
 174          return $this->cachingStrategyFactory
 175              ->createBestCachingStrategy($sharedStringsUniqueCount, $this->options->getTempFolder())
 176          ;
 177      }
 178  
 179      /**
 180       * Processes the shared strings item XML node which the given XML reader is positioned on.
 181       *
 182       * @param XMLReader $xmlReader         XML Reader positioned on a "<si>" node
 183       * @param int       $sharedStringIndex Index of the processed shared strings item
 184       */
 185      private function processSharedStringsItem(XMLReader $xmlReader, int $sharedStringIndex): void
 186      {
 187          $sharedStringValue = '';
 188  
 189          // NOTE: expand() will automatically decode all XML entities of the child nodes
 190          $siNode = $xmlReader->expand();
 191          \assert($siNode instanceof DOMElement);
 192          $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);
 193  
 194          foreach ($textNodes as $textNode) {
 195              if ($this->shouldExtractTextNodeValue($textNode)) {
 196                  $textNodeValue = $textNode->nodeValue;
 197                  \assert(null !== $textNodeValue);
 198                  $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);
 199  
 200                  $sharedStringValue .= $shouldPreserveWhitespace
 201                      ? $textNodeValue
 202                      : trim($textNodeValue)
 203                  ;
 204              }
 205          }
 206  
 207          $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
 208      }
 209  
 210      /**
 211       * Not all text nodes' values must be extracted.
 212       * Some text nodes are part of a node describing the pronunciation for instance.
 213       * We'll only consider the nodes whose parents are "<si>" or "<r>".
 214       *
 215       * @param DOMElement $textNode Text node to check
 216       *
 217       * @return bool Whether the given text node's value must be extracted
 218       */
 219      private function shouldExtractTextNodeValue(DOMElement $textNode): bool
 220      {
 221          $parentNode = $textNode->parentNode;
 222          \assert(null !== $parentNode);
 223          $parentTagName = $parentNode->localName;
 224  
 225          return self::XML_NODE_SI === $parentTagName || self::XML_NODE_R === $parentTagName;
 226      }
 227  
 228      /**
 229       * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
 230       *
 231       * @param DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
 232       *
 233       * @return bool Whether whitespace should be preserved
 234       */
 235      private function shouldPreserveWhitespace(DOMElement $textNode): bool
 236      {
 237          $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);
 238  
 239          return self::XML_ATTRIBUTE_VALUE_PRESERVE === $spaceValue;
 240      }
 241  }