Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.0.x will end 8 May 2023 (12 months).
  • Bug fixes for security issues in 4.0.x will end 13 November 2023 (18 months).
  • PHP version: minimum PHP 7.3.0. Note: the minimum PHP version has increased since Moodle 3.10. PHP 7.4.x is also supported.

Differences Between: [Versions 310 and 400] [Versions 39 and 400]

   1  <?php
   2  
   3  namespace Box\Spout\Reader\XLSX\Manager;
   4  
   5  use Box\Spout\Common\Exception\IOException;
   6  use Box\Spout\Reader\Exception\XMLProcessingException;
   7  use Box\Spout\Reader\Wrapper\XMLReader;
   8  use Box\Spout\Reader\XLSX\Creator\HelperFactory;
   9  use Box\Spout\Reader\XLSX\Creator\InternalEntityFactory;
  10  use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
  11  use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
  12  
  13  /**
  14   * Class SharedStringsManager
  15   * This class manages the shared strings defined in the associated XML file
  16   */
  17  class SharedStringsManager
  18  {
  19      /** Definition of XML nodes names used to parse data */
  20      const XML_NODE_SST = 'sst';
  21      const XML_NODE_SI = 'si';
  22      const XML_NODE_R = 'r';
  23      const XML_NODE_T = 't';
  24  
  25      /** Definition of XML attributes used to parse data */
  26      const XML_ATTRIBUTE_COUNT = 'count';
  27      const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
  28      const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
  29      const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';
  30  
  31      /** @var string Path of the XLSX file being read */
  32      protected $filePath;
  33  
  34      /** @var string Temporary folder where the temporary files to store shared strings will be stored */
  35      protected $tempFolder;
  36  
  37      /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */
  38      protected $workbookRelationshipsManager;
  39  
  40      /** @var InternalEntityFactory Factory to create entities */
  41      protected $entityFactory;
  42  
  43      /** @var HelperFactory Factory to create helpers */
  44      protected $helperFactory;
  45  
  46      /** @var CachingStrategyFactory Factory to create shared strings caching strategies */
  47      protected $cachingStrategyFactory;
  48  
  49      /** @var CachingStrategyInterface The best caching strategy for storing shared strings */
  50      protected $cachingStrategy;
  51  
  52      /**
  53       * @param string $filePath Path of the XLSX file being read
  54       * @param string $tempFolder Temporary folder where the temporary files to store shared strings will be stored
  55       * @param WorkbookRelationshipsManager $workbookRelationshipsManager Helps retrieving workbook relationships
  56       * @param InternalEntityFactory $entityFactory Factory to create entities
  57       * @param HelperFactory $helperFactory Factory to create helpers
  58       * @param CachingStrategyFactory $cachingStrategyFactory Factory to create shared strings caching strategies
  59       */
  60      public function __construct(
  61          $filePath,
  62          $tempFolder,
  63          $workbookRelationshipsManager,
  64          $entityFactory,
  65          $helperFactory,
  66          $cachingStrategyFactory
  67      ) {
  68          $this->filePath = $filePath;
  69          $this->tempFolder = $tempFolder;
  70          $this->workbookRelationshipsManager = $workbookRelationshipsManager;
  71          $this->entityFactory = $entityFactory;
  72          $this->helperFactory = $helperFactory;
  73          $this->cachingStrategyFactory = $cachingStrategyFactory;
  74      }
  75  
  76      /**
  77       * Returns whether the XLSX file contains a shared strings XML file
  78       *
  79       * @return bool
  80       */
  81      public function hasSharedStrings()
  82      {
  83          return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
  84      }
  85  
  86      /**
  87       * Builds an in-memory array containing all the shared strings of the sheet.
  88       * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
  89       * It is then accessed by the sheet data, via the string index in the built table.
  90       *
  91       * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
  92       *
  93       * The XML file can be really big with sheets containing a lot of data. That is why
  94       * we need to use a XML reader that provides streaming like the XMLReader library.
  95       *
  96       * @throws \Box\Spout\Common\Exception\IOException If shared strings XML file can't be read
  97       * @return void
  98       */
  99      public function extractSharedStrings()
 100      {
 101          $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
 102          $xmlReader = $this->entityFactory->createXMLReader();
 103          $sharedStringIndex = 0;
 104  
 105          if ($xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath) === false) {
 106              throw new IOException('Could not open "' . $sharedStringsXMLFilePath . '".');
 107          }
 108  
 109          try {
 110              $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
 111              $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
 112  
 113              $xmlReader->readUntilNodeFound(self::XML_NODE_SI);
 114  
 115              while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SI) {
 116                  $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
 117                  $sharedStringIndex++;
 118  
 119                  // jump to the next '<si>' tag
 120                  $xmlReader->next(self::XML_NODE_SI);
 121              }
 122  
 123              $this->cachingStrategy->closeCache();
 124          } catch (XMLProcessingException $exception) {
 125              throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
 126          }
 127  
 128          $xmlReader->close();
 129      }
 130  
 131      /**
 132       * Returns the shared strings unique count, as specified in <sst> tag.
 133       *
 134       * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
 135       * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
 136       * @return int|null Number of unique shared strings in the sharedStrings.xml file
 137       */
 138      protected function getSharedStringsUniqueCount($xmlReader)
 139      {
 140          $xmlReader->next(self::XML_NODE_SST);
 141  
 142          // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
 143          while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
 144              $xmlReader->read();
 145          }
 146  
 147          $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);
 148  
 149          // some software do not add the "uniqueCount" attribute but only use the "count" one
 150          // @see https://github.com/box/spout/issues/254
 151          if ($uniqueCount === null) {
 152              $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
 153          }
 154  
 155          return ($uniqueCount !== null) ? (int) $uniqueCount : null;
 156      }
 157  
 158      /**
 159       * Returns the best shared strings caching strategy.
 160       *
 161       * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
 162       * @return CachingStrategyInterface
 163       */
 164      protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
 165      {
 166          return $this->cachingStrategyFactory
 167                  ->createBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder, $this->helperFactory);
 168      }
 169  
 170      /**
 171       * Processes the shared strings item XML node which the given XML reader is positioned on.
 172       *
 173       * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node
 174       * @param int $sharedStringIndex Index of the processed shared strings item
 175       * @return void
 176       */
 177      protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
 178      {
 179          $sharedStringValue = '';
 180  
 181          // NOTE: expand() will automatically decode all XML entities of the child nodes
 182          $siNode = $xmlReader->expand();
 183          $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);
 184  
 185          foreach ($textNodes as $textNode) {
 186              if ($this->shouldExtractTextNodeValue($textNode)) {
 187                  $textNodeValue = $textNode->nodeValue;
 188                  $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);
 189  
 190                  $sharedStringValue .= ($shouldPreserveWhitespace) ? $textNodeValue : \trim($textNodeValue);
 191              }
 192          }
 193  
 194          $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
 195      }
 196  
 197      /**
 198       * Not all text nodes' values must be extracted.
 199       * Some text nodes are part of a node describing the pronunciation for instance.
 200       * We'll only consider the nodes whose parents are "<si>" or "<r>".
 201       *
 202       * @param \DOMElement $textNode Text node to check
 203       * @return bool Whether the given text node's value must be extracted
 204       */
 205      protected function shouldExtractTextNodeValue($textNode)
 206      {
 207          $parentTagName = $textNode->parentNode->localName;
 208  
 209          return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R);
 210      }
 211  
 212      /**
 213       * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
 214       *
 215       * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
 216       * @return bool Whether whitespace should be preserved
 217       */
 218      protected function shouldPreserveWhitespace($textNode)
 219      {
 220          $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);
 221  
 222          return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE);
 223      }
 224  
 225      /**
 226       * Returns the shared string at the given index, using the previously chosen caching strategy.
 227       *
 228       * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
 229       * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
 230       * @return string The shared string at the given index
 231       */
 232      public function getStringAtIndex($sharedStringIndex)
 233      {
 234          return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
 235      }
 236  
 237      /**
 238       * Destroys the cache, freeing memory and removing any created artifacts
 239       *
 240       * @return void
 241       */
 242      public function cleanup()
 243      {
 244          if ($this->cachingStrategy) {
 245              $this->cachingStrategy->clearCache();
 246          }
 247      }
 248  }