Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.2.x will end 22 April 2024 (12 months).
  • Bug fixes for security issues in 4.2.x will end 7 October 2024 (18 months).
  • PHP version: minimum PHP 8.0.0 Note: minimum PHP version has increased since Moodle 4.1. PHP 8.1.x is supported too.

Differences Between: [Versions 402 and 403]

   1  <?php
   2  
   3  declare(strict_types=1);
   4  
   5  namespace OpenSpout\Reader\XLSX;
   6  
   7  use DOMElement;
   8  use OpenSpout\Common\Entity\Cell;
   9  use OpenSpout\Common\Entity\Row;
  10  use OpenSpout\Common\Exception\IOException;
  11  use OpenSpout\Reader\Common\Manager\RowManager;
  12  use OpenSpout\Reader\Common\XMLProcessor;
  13  use OpenSpout\Reader\Exception\InvalidValueException;
  14  use OpenSpout\Reader\RowIteratorInterface;
  15  use OpenSpout\Reader\Wrapper\XMLReader;
  16  use OpenSpout\Reader\XLSX\Helper\CellHelper;
  17  use OpenSpout\Reader\XLSX\Helper\CellValueFormatter;
  18  
  19  final class RowIterator implements RowIteratorInterface
  20  {
  21      /**
  22       * Definition of XML nodes names used to parse data.
  23       */
  24      public const XML_NODE_DIMENSION = 'dimension';
  25      public const XML_NODE_WORKSHEET = 'worksheet';
  26      public const XML_NODE_ROW = 'row';
  27      public const XML_NODE_CELL = 'c';
  28  
  29      /**
  30       * Definition of XML attributes used to parse data.
  31       */
  32      public const XML_ATTRIBUTE_REF = 'ref';
  33      public const XML_ATTRIBUTE_SPANS = 'spans';
  34      public const XML_ATTRIBUTE_ROW_INDEX = 'r';
  35      public const XML_ATTRIBUTE_CELL_INDEX = 'r';
  36  
  37      /** @var string Path of the XLSX file being read */
  38      private string $filePath;
  39  
  40      /** @var string Path of the sheet data XML file as in [Content_Types].xml */
  41      private string $sheetDataXMLFilePath;
  42  
  43      /** @var XMLReader The XMLReader object that will help read sheet's XML data */
  44      private XMLReader $xmlReader;
  45  
  46      /** @var XMLProcessor Helper Object to process XML nodes */
  47      private XMLProcessor $xmlProcessor;
  48  
  49      /** @var Helper\CellValueFormatter Helper to format cell values */
  50      private Helper\CellValueFormatter $cellValueFormatter;
  51  
  52      /** @var RowManager Manages rows */
  53      private RowManager $rowManager;
  54  
  55      /**
  56       * TODO: This variable can be deleted when row indices get preserved.
  57       *
  58       * @var int Number of read rows
  59       */
  60      private int $numReadRows = 0;
  61  
  62      /** @var Row Contains the row currently processed */
  63      private Row $currentlyProcessedRow;
  64  
  65      /** @var null|Row Buffer used to store the current row, while checking if there are more rows to read */
  66      private ?Row $rowBuffer;
  67  
  68      /** @var bool Indicates whether all rows have been read */
  69      private bool $hasReachedEndOfFile = false;
  70  
  71      /** @var int The number of columns the sheet has (0 meaning undefined) */
  72      private int $numColumns = 0;
  73  
  74      /** @var bool Whether empty rows should be returned or skipped */
  75      private bool $shouldPreserveEmptyRows;
  76  
  77      /** @var int Last row index processed (one-based) */
  78      private int $lastRowIndexProcessed = 0;
  79  
  80      /** @var int Row index to be processed next (one-based) */
  81      private int $nextRowIndexToBeProcessed = 0;
  82  
  83      /** @var int Last column index processed (zero-based) */
  84      private int $lastColumnIndexProcessed = -1;
  85  
  86      /**
  87       * @param string             $filePath                Path of the XLSX file being read
  88       * @param string             $sheetDataXMLFilePath    Path of the sheet data XML file as in [Content_Types].xml
  89       * @param bool               $shouldPreserveEmptyRows Whether empty rows should be preserved
  90       * @param XMLReader          $xmlReader               XML Reader
  91       * @param XMLProcessor       $xmlProcessor            Helper to process XML files
  92       * @param CellValueFormatter $cellValueFormatter      Helper to format cell values
  93       * @param RowManager         $rowManager              Manages rows
  94       */
  95      public function __construct(
  96          string $filePath,
  97          string $sheetDataXMLFilePath,
  98          bool $shouldPreserveEmptyRows,
  99          XMLReader $xmlReader,
 100          XMLProcessor $xmlProcessor,
 101          CellValueFormatter $cellValueFormatter,
 102          RowManager $rowManager
 103      ) {
 104          $this->filePath = $filePath;
 105          $this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath);
 106          $this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows;
 107          $this->xmlReader = $xmlReader;
 108          $this->cellValueFormatter = $cellValueFormatter;
 109          $this->rowManager = $rowManager;
 110  
 111          // Register all callbacks to process different nodes when reading the XML file
 112          $this->xmlProcessor = $xmlProcessor;
 113          $this->xmlProcessor->registerCallback(self::XML_NODE_DIMENSION, XMLProcessor::NODE_TYPE_START, [$this, 'processDimensionStartingNode']);
 114          $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_START, [$this, 'processRowStartingNode']);
 115          $this->xmlProcessor->registerCallback(self::XML_NODE_CELL, XMLProcessor::NODE_TYPE_START, [$this, 'processCellStartingNode']);
 116          $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_END, [$this, 'processRowEndingNode']);
 117          $this->xmlProcessor->registerCallback(self::XML_NODE_WORKSHEET, XMLProcessor::NODE_TYPE_END, [$this, 'processWorksheetEndingNode']);
 118      }
 119  
 120      /**
 121       * Rewind the Iterator to the first element.
 122       * Initializes the XMLReader object that reads the associated sheet data.
 123       * The XMLReader is configured to be safe from billion laughs attack.
 124       *
 125       * @see http://php.net/manual/en/iterator.rewind.php
 126       *
 127       * @throws \OpenSpout\Common\Exception\IOException If the sheet data XML cannot be read
 128       */
 129      public function rewind(): void
 130      {
 131          $this->xmlReader->close();
 132  
 133          if (false === $this->xmlReader->openFileInZip($this->filePath, $this->sheetDataXMLFilePath)) {
 134              throw new IOException("Could not open \"{$this->sheetDataXMLFilePath}\".");
 135          }
 136  
 137          $this->numReadRows = 0;
 138          $this->lastRowIndexProcessed = 0;
 139          $this->nextRowIndexToBeProcessed = 0;
 140          $this->rowBuffer = null;
 141          $this->hasReachedEndOfFile = false;
 142          $this->numColumns = 0;
 143  
 144          $this->next();
 145      }
 146  
 147      /**
 148       * Checks if current position is valid.
 149       *
 150       * @see http://php.net/manual/en/iterator.valid.php
 151       */
 152      public function valid(): bool
 153      {
 154          $valid = !$this->hasReachedEndOfFile;
 155          if (!$valid) {
 156              $this->xmlReader->close();
 157          }
 158  
 159          return $valid;
 160      }
 161  
 162      /**
 163       * Move forward to next element. Reads data describing the next unprocessed row.
 164       *
 165       * @see http://php.net/manual/en/iterator.next.php
 166       *
 167       * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If a shared string was not found
 168       * @throws \OpenSpout\Common\Exception\IOException                   If unable to read the sheet data XML
 169       */
 170      public function next(): void
 171      {
 172          ++$this->nextRowIndexToBeProcessed;
 173  
 174          if ($this->doesNeedDataForNextRowToBeProcessed()) {
 175              $this->readDataForNextRow();
 176          }
 177      }
 178  
 179      /**
 180       * Return the current element, either an empty row or from the buffer.
 181       *
 182       * @see http://php.net/manual/en/iterator.current.php
 183       */
 184      public function current(): Row
 185      {
 186          $rowToBeProcessed = $this->rowBuffer;
 187  
 188          if ($this->shouldPreserveEmptyRows) {
 189              // when we need to preserve empty rows, we will either return
 190              // an empty row or the last row read. This depends whether the
 191              // index of last row that was read matches the index of the last
 192              // row whose value should be returned.
 193              if ($this->lastRowIndexProcessed !== $this->nextRowIndexToBeProcessed) {
 194                  // return empty row if mismatch between last processed row
 195                  // and the row that needs to be returned
 196                  $rowToBeProcessed = new Row([], null);
 197              }
 198          }
 199  
 200          \assert(null !== $rowToBeProcessed);
 201  
 202          return $rowToBeProcessed;
 203      }
 204  
 205      /**
 206       * Return the key of the current element. Here, the row index.
 207       *
 208       * @see http://php.net/manual/en/iterator.key.php
 209       */
 210      public function key(): int
 211      {
 212          // TODO: This should return $this->nextRowIndexToBeProcessed
 213          //       but to avoid a breaking change, the return value for
 214          //       this function has been kept as the number of rows read.
 215          return $this->shouldPreserveEmptyRows ?
 216                  $this->nextRowIndexToBeProcessed :
 217                  $this->numReadRows;
 218      }
 219  
 220      /**
 221       * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
 222       *
 223       * @return string path of the XML file containing the sheet data,
 224       *                without the leading slash
 225       */
 226      private function normalizeSheetDataXMLFilePath(string $sheetDataXMLFilePath): string
 227      {
 228          return ltrim($sheetDataXMLFilePath, '/');
 229      }
 230  
 231      /**
 232       * Returns whether we need data for the next row to be processed.
 233       * We don't need to read data if:
 234       *   we have already read at least one row
 235       *     AND
 236       *   we need to preserve empty rows
 237       *     AND
 238       *   the last row that was read is not the row that need to be processed
 239       *   (i.e. if we need to return empty rows).
 240       *
 241       * @return bool whether we need data for the next row to be processed
 242       */
 243      private function doesNeedDataForNextRowToBeProcessed(): bool
 244      {
 245          $hasReadAtLeastOneRow = (0 !== $this->lastRowIndexProcessed);
 246  
 247          return
 248              !$hasReadAtLeastOneRow
 249              || !$this->shouldPreserveEmptyRows
 250              || $this->lastRowIndexProcessed < $this->nextRowIndexToBeProcessed
 251          ;
 252      }
 253  
 254      /**
 255       * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If a shared string was not found
 256       * @throws \OpenSpout\Common\Exception\IOException                   If unable to read the sheet data XML
 257       */
 258      private function readDataForNextRow(): void
 259      {
 260          $this->currentlyProcessedRow = new Row([], null);
 261  
 262          $this->xmlProcessor->readUntilStopped();
 263  
 264          $this->rowBuffer = $this->currentlyProcessedRow;
 265      }
 266  
 267      /**
 268       * @param XMLReader $xmlReader XMLReader object, positioned on a "<dimension>" starting node
 269       *
 270       * @return int A return code that indicates what action should the processor take next
 271       */
 272      private function processDimensionStartingNode(XMLReader $xmlReader): int
 273      {
 274          // Read dimensions of the sheet
 275          $dimensionRef = $xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet)
 276          \assert(null !== $dimensionRef);
 277          if (1 === preg_match('/[A-Z]+\d+:([A-Z]+\d+)/', $dimensionRef, $matches)) {
 278              $this->numColumns = CellHelper::getColumnIndexFromCellIndex($matches[1]) + 1;
 279          }
 280  
 281          return XMLProcessor::PROCESSING_CONTINUE;
 282      }
 283  
 284      /**
 285       * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" starting node
 286       *
 287       * @return int A return code that indicates what action should the processor take next
 288       */
 289      private function processRowStartingNode(XMLReader $xmlReader): int
 290      {
 291          // Reset index of the last processed column
 292          $this->lastColumnIndexProcessed = -1;
 293  
 294          // Mark the last processed row as the one currently being read
 295          $this->lastRowIndexProcessed = $this->getRowIndex($xmlReader);
 296  
 297          // Read spans info if present
 298          $numberOfColumnsForRow = $this->numColumns;
 299          $spans = $xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance
 300          if (null !== $spans) {
 301              [, $numberOfColumnsForRow] = explode(':', $spans);
 302              $numberOfColumnsForRow = (int) $numberOfColumnsForRow;
 303          }
 304  
 305          $cells = array_fill(0, $numberOfColumnsForRow, Cell::fromValue(''));
 306          $this->currentlyProcessedRow->setCells($cells);
 307  
 308          return XMLProcessor::PROCESSING_CONTINUE;
 309      }
 310  
 311      /**
 312       * @param XMLReader $xmlReader XMLReader object, positioned on a "<cell>" starting node
 313       *
 314       * @return int A return code that indicates what action should the processor take next
 315       */
 316      private function processCellStartingNode(XMLReader $xmlReader): int
 317      {
 318          $currentColumnIndex = $this->getColumnIndex($xmlReader);
 319  
 320          // NOTE: expand() will automatically decode all XML entities of the child nodes
 321          /** @var DOMElement $node */
 322          $node = $xmlReader->expand();
 323          $cell = $this->getCell($node);
 324  
 325          $this->currentlyProcessedRow->setCellAtIndex($cell, $currentColumnIndex);
 326          $this->lastColumnIndexProcessed = $currentColumnIndex;
 327  
 328          return XMLProcessor::PROCESSING_CONTINUE;
 329      }
 330  
 331      /**
 332       * @return int A return code that indicates what action should the processor take next
 333       */
 334      private function processRowEndingNode(): int
 335      {
 336          // if the fetched row is empty and we don't want to preserve it..,
 337          if (!$this->shouldPreserveEmptyRows && $this->currentlyProcessedRow->isEmpty()) {
 338              // ... skip it
 339              return XMLProcessor::PROCESSING_CONTINUE;
 340          }
 341  
 342          ++$this->numReadRows;
 343  
 344          // If needed, we fill the empty cells
 345          if (0 === $this->numColumns) {
 346              $this->rowManager->fillMissingIndexesWithEmptyCells($this->currentlyProcessedRow);
 347          }
 348  
 349          // at this point, we have all the data we need for the row
 350          // so that we can populate the buffer
 351          return XMLProcessor::PROCESSING_STOP;
 352      }
 353  
 354      /**
 355       * @return int A return code that indicates what action should the processor take next
 356       */
 357      private function processWorksheetEndingNode(): int
 358      {
 359          // The closing "</worksheet>" marks the end of the file
 360          $this->hasReachedEndOfFile = true;
 361  
 362          return XMLProcessor::PROCESSING_STOP;
 363      }
 364  
 365      /**
 366       * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" node
 367       *
 368       * @return int Row index
 369       *
 370       *@throws \OpenSpout\Common\Exception\InvalidArgumentException When the given cell index is invalid
 371       */
 372      private function getRowIndex(XMLReader $xmlReader): int
 373      {
 374          // Get "r" attribute if present (from something like <row r="3"...>
 375          $currentRowIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_ROW_INDEX);
 376  
 377          return (null !== $currentRowIndex) ?
 378                  (int) $currentRowIndex :
 379                  $this->lastRowIndexProcessed + 1;
 380      }
 381  
 382      /**
 383       * @param XMLReader $xmlReader XMLReader object, positioned on a "<c>" node
 384       *
 385       * @return int Column index
 386       *
 387       *@throws \OpenSpout\Common\Exception\InvalidArgumentException When the given cell index is invalid
 388       */
 389      private function getColumnIndex(XMLReader $xmlReader): int
 390      {
 391          // Get "r" attribute if present (from something like <c r="A1"...>
 392          $currentCellIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX);
 393  
 394          return (null !== $currentCellIndex) ?
 395                  CellHelper::getColumnIndexFromCellIndex($currentCellIndex) :
 396                  $this->lastColumnIndexProcessed + 1;
 397      }
 398  
 399      /**
 400       * Returns the cell with (unescaped) correctly marshalled, cell value associated to the given XML node.
 401       *
 402       * @return Cell The cell set with the associated with the cell
 403       */
 404      private function getCell(DOMElement $node): Cell
 405      {
 406          try {
 407              $cellValue = $this->cellValueFormatter->extractAndFormatNodeValue($node);
 408              $cell = Cell::fromValue($cellValue);
 409          } catch (InvalidValueException $exception) {
 410              $cell = new Cell\ErrorCell($exception->getInvalidValue(), null);
 411          }
 412  
 413          return $cell;
 414      }
 415  }