Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 3.10.x will end 8 November 2021 (12 months).
  • Bug fixes for security issues in 3.10.x will end 9 May 2022 (18 months).
  • PHP version: minimum PHP 7.2.0. Note: the minimum PHP version has increased since Moodle 3.8. PHP 7.3.x and 7.4.x are supported too.

Differences Between: [Versions 310 and 311] [Versions 310 and 400] [Versions 310 and 401]

   1  <?php
   2  
   3  namespace Box\Spout\Reader\XLSX\Manager;
   4  
   5  use Box\Spout\Common\Exception\IOException;
   6  use Box\Spout\Reader\Exception\XMLProcessingException;
   7  use Box\Spout\Reader\Wrapper\XMLReader;
   8  use Box\Spout\Reader\XLSX\Creator\HelperFactory;
   9  use Box\Spout\Reader\XLSX\Creator\InternalEntityFactory;
  10  use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyFactory;
  11  use Box\Spout\Reader\XLSX\Manager\SharedStringsCaching\CachingStrategyInterface;
  12  
  13  /**
  14   * Class SharedStringsManager
  15   * This class manages the shared strings defined in the associated XML file
  16   */
  17  class SharedStringsManager
  18  {
  19      /** Main namespace for the sharedStrings.xml file */
  20      const MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main';
  21  
  22      /** Definition of XML nodes names used to parse data */
  23      const XML_NODE_SST = 'sst';
  24      const XML_NODE_SI = 'si';
  25      const XML_NODE_R = 'r';
  26      const XML_NODE_T = 't';
  27  
  28      /** Definition of XML attributes used to parse data */
  29      const XML_ATTRIBUTE_COUNT = 'count';
  30      const XML_ATTRIBUTE_UNIQUE_COUNT = 'uniqueCount';
  31      const XML_ATTRIBUTE_XML_SPACE = 'xml:space';
  32      const XML_ATTRIBUTE_VALUE_PRESERVE = 'preserve';
  33  
  34      /** @var string Path of the XLSX file being read */
  35      protected $filePath;
  36  
  37      /** @var string Temporary folder where the temporary files to store shared strings will be stored */
  38      protected $tempFolder;
  39  
  40      /** @var WorkbookRelationshipsManager Helps retrieving workbook relationships */
  41      protected $workbookRelationshipsManager;
  42  
  43      /** @var InternalEntityFactory Factory to create entities */
  44      protected $entityFactory;
  45  
  46      /** @var HelperFactory $helperFactory Factory to create helpers */
  47      protected $helperFactory;
  48  
  49      /** @var CachingStrategyFactory Factory to create shared strings caching strategies */
  50      protected $cachingStrategyFactory;
  51  
  52      /** @var CachingStrategyInterface The best caching strategy for storing shared strings */
  53      protected $cachingStrategy;
  54  
  55      /**
  56       * @param string $filePath Path of the XLSX file being read
  57       * @param string $tempFolder Temporary folder where the temporary files to store shared strings will be stored
  58       * @param WorkbookRelationshipsManager $workbookRelationshipsManager Helps retrieving workbook relationships
  59       * @param InternalEntityFactory $entityFactory Factory to create entities
  60       * @param HelperFactory $helperFactory Factory to create helpers
  61       * @param CachingStrategyFactory $cachingStrategyFactory Factory to create shared strings caching strategies
  62       */
  63      public function __construct(
  64          $filePath,
  65          $tempFolder,
  66          $workbookRelationshipsManager,
  67          $entityFactory,
  68          $helperFactory,
  69          $cachingStrategyFactory
  70      ) {
  71          $this->filePath = $filePath;
  72          $this->tempFolder = $tempFolder;
  73          $this->workbookRelationshipsManager = $workbookRelationshipsManager;
  74          $this->entityFactory = $entityFactory;
  75          $this->helperFactory = $helperFactory;
  76          $this->cachingStrategyFactory = $cachingStrategyFactory;
  77      }
  78  
  79      /**
  80       * Returns whether the XLSX file contains a shared strings XML file
  81       *
  82       * @return bool
  83       */
  84      public function hasSharedStrings()
  85      {
  86          return $this->workbookRelationshipsManager->hasSharedStringsXMLFile();
  87      }
  88  
  89      /**
  90       * Builds an in-memory array containing all the shared strings of the sheet.
  91       * All the strings are stored in a XML file, located at 'xl/sharedStrings.xml'.
  92       * It is then accessed by the sheet data, via the string index in the built table.
  93       *
  94       * More documentation available here: http://msdn.microsoft.com/en-us/library/office/gg278314.aspx
  95       *
  96       * The XML file can be really big with sheets containing a lot of data. That is why
  97       * we need to use a XML reader that provides streaming like the XMLReader library.
  98       *
  99       * @throws \Box\Spout\Common\Exception\IOException If shared strings XML file can't be read
 100       * @return void
 101       */
 102      public function extractSharedStrings()
 103      {
 104          $sharedStringsXMLFilePath = $this->workbookRelationshipsManager->getSharedStringsXMLFilePath();
 105          $xmlReader = $this->entityFactory->createXMLReader();
 106          $sharedStringIndex = 0;
 107  
 108          if ($xmlReader->openFileInZip($this->filePath, $sharedStringsXMLFilePath) === false) {
 109              throw new IOException('Could not open "' . $sharedStringsXMLFilePath . '".');
 110          }
 111  
 112          try {
 113              $sharedStringsUniqueCount = $this->getSharedStringsUniqueCount($xmlReader);
 114              $this->cachingStrategy = $this->getBestSharedStringsCachingStrategy($sharedStringsUniqueCount);
 115  
 116              $xmlReader->readUntilNodeFound(self::XML_NODE_SI);
 117  
 118              while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SI) {
 119                  $this->processSharedStringsItem($xmlReader, $sharedStringIndex);
 120                  $sharedStringIndex++;
 121  
 122                  // jump to the next '<si>' tag
 123                  $xmlReader->next(self::XML_NODE_SI);
 124              }
 125  
 126              $this->cachingStrategy->closeCache();
 127          } catch (XMLProcessingException $exception) {
 128              throw new IOException("The sharedStrings.xml file is invalid and cannot be read. [{$exception->getMessage()}]");
 129          }
 130  
 131          $xmlReader->close();
 132      }
 133  
 134      /**
 135       * Returns the shared strings unique count, as specified in <sst> tag.
 136       *
 137       * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XMLReader instance
 138       * @throws \Box\Spout\Common\Exception\IOException If sharedStrings.xml is invalid and can't be read
 139       * @return int|null Number of unique shared strings in the sharedStrings.xml file
 140       */
 141      protected function getSharedStringsUniqueCount($xmlReader)
 142      {
 143          $xmlReader->next(self::XML_NODE_SST);
 144  
 145          // Iterate over the "sst" elements to get the actual "sst ELEMENT" (skips any DOCTYPE)
 146          while ($xmlReader->getCurrentNodeName() === self::XML_NODE_SST && $xmlReader->nodeType !== XMLReader::ELEMENT) {
 147              $xmlReader->read();
 148          }
 149  
 150          $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_UNIQUE_COUNT);
 151  
 152          // some software do not add the "uniqueCount" attribute but only use the "count" one
 153          // @see https://github.com/box/spout/issues/254
 154          if ($uniqueCount === null) {
 155              $uniqueCount = $xmlReader->getAttribute(self::XML_ATTRIBUTE_COUNT);
 156          }
 157  
 158          return ($uniqueCount !== null) ? (int) $uniqueCount : null;
 159      }
 160  
 161      /**
 162       * Returns the best shared strings caching strategy.
 163       *
 164       * @param int|null $sharedStringsUniqueCount Number of unique shared strings (NULL if unknown)
 165       * @return CachingStrategyInterface
 166       */
 167      protected function getBestSharedStringsCachingStrategy($sharedStringsUniqueCount)
 168      {
 169          return $this->cachingStrategyFactory
 170                  ->createBestCachingStrategy($sharedStringsUniqueCount, $this->tempFolder, $this->helperFactory);
 171      }
 172  
 173      /**
 174       * Processes the shared strings item XML node which the given XML reader is positioned on.
 175       *
 176       * @param \Box\Spout\Reader\Wrapper\XMLReader $xmlReader XML Reader positioned on a "<si>" node
 177       * @param int $sharedStringIndex Index of the processed shared strings item
 178       * @return void
 179       */
 180      protected function processSharedStringsItem($xmlReader, $sharedStringIndex)
 181      {
 182          $sharedStringValue = '';
 183  
 184          // NOTE: expand() will automatically decode all XML entities of the child nodes
 185          $siNode = $xmlReader->expand();
 186          $textNodes = $siNode->getElementsByTagName(self::XML_NODE_T);
 187  
 188          foreach ($textNodes as $textNode) {
 189              if ($this->shouldExtractTextNodeValue($textNode)) {
 190                  $textNodeValue = $textNode->nodeValue;
 191                  $shouldPreserveWhitespace = $this->shouldPreserveWhitespace($textNode);
 192  
 193                  $sharedStringValue .= ($shouldPreserveWhitespace) ? $textNodeValue : trim($textNodeValue);
 194              }
 195          }
 196  
 197          $this->cachingStrategy->addStringForIndex($sharedStringValue, $sharedStringIndex);
 198      }
 199  
 200      /**
 201       * Not all text nodes' values must be extracted.
 202       * Some text nodes are part of a node describing the pronunciation for instance.
 203       * We'll only consider the nodes whose parents are "<si>" or "<r>".
 204       *
 205       * @param \DOMElement $textNode Text node to check
 206       * @return bool Whether the given text node's value must be extracted
 207       */
 208      protected function shouldExtractTextNodeValue($textNode)
 209      {
 210          $parentTagName = $textNode->parentNode->localName;
 211  
 212          return ($parentTagName === self::XML_NODE_SI || $parentTagName === self::XML_NODE_R);
 213      }
 214  
 215      /**
 216       * If the text node has the attribute 'xml:space="preserve"', then preserve whitespace.
 217       *
 218       * @param \DOMElement $textNode The text node element (<t>) whose whitespace may be preserved
 219       * @return bool Whether whitespace should be preserved
 220       */
 221      protected function shouldPreserveWhitespace($textNode)
 222      {
 223          $spaceValue = $textNode->getAttribute(self::XML_ATTRIBUTE_XML_SPACE);
 224  
 225          return ($spaceValue === self::XML_ATTRIBUTE_VALUE_PRESERVE);
 226      }
 227  
 228      /**
 229       * Returns the shared string at the given index, using the previously chosen caching strategy.
 230       *
 231       * @param int $sharedStringIndex Index of the shared string in the sharedStrings.xml file
 232       * @throws \Box\Spout\Reader\Exception\SharedStringNotFoundException If no shared string found for the given index
 233       * @return string The shared string at the given index
 234       */
 235      public function getStringAtIndex($sharedStringIndex)
 236      {
 237          return $this->cachingStrategy->getStringAtIndex($sharedStringIndex);
 238      }
 239  
 240      /**
 241       * Destroys the cache, freeing memory and removing any created artifacts
 242       *
 243       * @return void
 244       */
 245      public function cleanup()
 246      {
 247          if ($this->cachingStrategy) {
 248              $this->cachingStrategy->clearCache();
 249          }
 250      }
 251  }