Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.3.x will end 7 October 2024 (12 months).
  • Bug fixes for security issues in 4.3.x will end 21 April 2025 (18 months).
  • PHP version: minimum PHP 8.0.0 Note: minimum PHP version has increased since Moodle 4.1. PHP 8.2.x is supported too.

Differences Between: [Versions 402 and 403]

   1  <?php
   2  
   3  declare(strict_types=1);
   4  
   5  namespace OpenSpout\Reader\XLSX;
   6  
   7  use DOMElement;
   8  use OpenSpout\Common\Entity\Cell;
   9  use OpenSpout\Common\Entity\Row;
  10  use OpenSpout\Common\Exception\IOException;
  11  use OpenSpout\Reader\Common\Manager\RowManager;
  12  use OpenSpout\Reader\Common\XMLProcessor;
  13  use OpenSpout\Reader\Exception\InvalidValueException;
  14  use OpenSpout\Reader\RowIteratorInterface;
  15  use OpenSpout\Reader\Wrapper\XMLReader;
  16  use OpenSpout\Reader\XLSX\Helper\CellHelper;
  17  use OpenSpout\Reader\XLSX\Helper\CellValueFormatter;
  18  
  19  final class RowIterator implements RowIteratorInterface
  20  {
  21      /**
  22       * Definition of XML nodes names used to parse data.
  23       */
  24      public const XML_NODE_DIMENSION = 'dimension';
  25      public const XML_NODE_WORKSHEET = 'worksheet';
  26      public const XML_NODE_ROW = 'row';
  27      public const XML_NODE_CELL = 'c';
  28  
  29      /**
  30       * Definition of XML attributes used to parse data.
  31       */
  32      public const XML_ATTRIBUTE_REF = 'ref';
  33      public const XML_ATTRIBUTE_SPANS = 'spans';
  34      public const XML_ATTRIBUTE_ROW_INDEX = 'r';
  35      public const XML_ATTRIBUTE_CELL_INDEX = 'r';
  36  
  37      /** @var string Path of the XLSX file being read */
  38      private string $filePath;
  39  
  40      /** @var string Path of the sheet data XML file as in [Content_Types].xml */
  41      private string $sheetDataXMLFilePath;
  42  
  43      /** @var XMLReader The XMLReader object that will help read sheet's XML data */
  44      private XMLReader $xmlReader;
  45  
  46      /** @var XMLProcessor Helper Object to process XML nodes */
  47      private XMLProcessor $xmlProcessor;
  48  
  49      /** @var Helper\CellValueFormatter Helper to format cell values */
  50      private Helper\CellValueFormatter $cellValueFormatter;
  51  
  52      /** @var RowManager Manages rows */
  53      private RowManager $rowManager;
  54  
  55      /**
  56       * TODO: This variable can be deleted when row indices get preserved.
  57       *
  58       * @var int Number of read rows
  59       */
  60      private int $numReadRows = 0;
  61  
  62      /** @var Row Contains the row currently processed */
  63      private Row $currentlyProcessedRow;
  64  
  65      /** @var null|Row Buffer used to store the current row, while checking if there are more rows to read */
  66      private ?Row $rowBuffer;
  67  
  68      /** @var bool Indicates whether all rows have been read */
  69      private bool $hasReachedEndOfFile = false;
  70  
  71      /** @var int The number of columns the sheet has (0 meaning undefined) */
  72      private int $numColumns = 0;
  73  
  74      /** @var bool Whether empty rows should be returned or skipped */
  75      private bool $shouldPreserveEmptyRows;
  76  
  77      /** @var int Last row index processed (one-based) */
  78      private int $lastRowIndexProcessed = 0;
  79  
  80      /** @var int Row index to be processed next (one-based) */
  81      private int $nextRowIndexToBeProcessed = 0;
  82  
  83      /** @var int Last column index processed (zero-based) */
  84      private int $lastColumnIndexProcessed = -1;
  85  
  86      /**
  87       * @param string             $filePath                Path of the XLSX file being read
  88       * @param string             $sheetDataXMLFilePath    Path of the sheet data XML file as in [Content_Types].xml
  89       * @param bool               $shouldPreserveEmptyRows Whether empty rows should be preserved
  90       * @param XMLReader          $xmlReader               XML Reader
  91       * @param XMLProcessor       $xmlProcessor            Helper to process XML files
  92       * @param CellValueFormatter $cellValueFormatter      Helper to format cell values
  93       * @param RowManager         $rowManager              Manages rows
  94       */
  95      public function __construct(
  96          string $filePath,
  97          string $sheetDataXMLFilePath,
  98          bool $shouldPreserveEmptyRows,
  99          XMLReader $xmlReader,
 100          XMLProcessor $xmlProcessor,
 101          CellValueFormatter $cellValueFormatter,
 102          RowManager $rowManager
 103      ) {
 104          $this->filePath = $filePath;
 105          $this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath);
 106          $this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows;
 107          $this->xmlReader = $xmlReader;
 108          $this->cellValueFormatter = $cellValueFormatter;
 109          $this->rowManager = $rowManager;
 110  
 111          // Register all callbacks to process different nodes when reading the XML file
 112          $this->xmlProcessor = $xmlProcessor;
 113          $this->xmlProcessor->registerCallback(self::XML_NODE_DIMENSION, XMLProcessor::NODE_TYPE_START, [$this, 'processDimensionStartingNode']);
 114          $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_START, [$this, 'processRowStartingNode']);
 115          $this->xmlProcessor->registerCallback(self::XML_NODE_CELL, XMLProcessor::NODE_TYPE_START, [$this, 'processCellStartingNode']);
 116          $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_END, [$this, 'processRowEndingNode']);
 117          $this->xmlProcessor->registerCallback(self::XML_NODE_WORKSHEET, XMLProcessor::NODE_TYPE_END, [$this, 'processWorksheetEndingNode']);
 118      }
 119  
 120      /**
 121       * Rewind the Iterator to the first element.
 122       * Initializes the XMLReader object that reads the associated sheet data.
 123       * The XMLReader is configured to be safe from billion laughs attack.
 124       *
 125       * @see http://php.net/manual/en/iterator.rewind.php
 126       *
 127       * @throws \OpenSpout\Common\Exception\IOException If the sheet data XML cannot be read
 128       */
 129      public function rewind(): void
 130      {
 131          $this->xmlReader->close();
 132  
 133          if (false === $this->xmlReader->openFileInZip($this->filePath, $this->sheetDataXMLFilePath)) {
 134              throw new IOException("Could not open \"{$this->sheetDataXMLFilePath}\".");
 135          }
 136  
 137          $this->numReadRows = 0;
 138          $this->lastRowIndexProcessed = 0;
 139          $this->nextRowIndexToBeProcessed = 0;
 140          $this->rowBuffer = null;
 141          $this->hasReachedEndOfFile = false;
 142          $this->numColumns = 0;
 143  
 144          $this->next();
 145      }
 146  
 147      /**
 148       * Checks if current position is valid.
 149       *
 150       * @see http://php.net/manual/en/iterator.valid.php
 151       */
 152      public function valid(): bool
 153      {
 154          $valid = !$this->hasReachedEndOfFile;
 155          if (!$valid) {
 156              $this->xmlReader->close();
 157          }
 158  
 159          return $valid;
 160      }
 161  
 162      /**
 163       * Move forward to next element. Reads data describing the next unprocessed row.
 164       *
 165       * @see http://php.net/manual/en/iterator.next.php
 166       *
 167       * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If a shared string was not found
 168       * @throws \OpenSpout\Common\Exception\IOException                   If unable to read the sheet data XML
 169       */
 170      public function next(): void
 171      {
 172          ++$this->nextRowIndexToBeProcessed;
 173  
 174          if ($this->doesNeedDataForNextRowToBeProcessed()) {
 175              $this->readDataForNextRow();
 176          }
 177      }
 178  
 179      /**
 180       * Return the current element, either an empty row or from the buffer.
 181       *
 182       * @see http://php.net/manual/en/iterator.current.php
 183       */
 184      public function current(): Row
 185      {
 186          $rowToBeProcessed = $this->rowBuffer;
 187  
 188          if ($this->shouldPreserveEmptyRows) {
 189              // when we need to preserve empty rows, we will either return
 190              // an empty row or the last row read. This depends whether the
 191              // index of last row that was read matches the index of the last
 192              // row whose value should be returned.
 193              if ($this->lastRowIndexProcessed !== $this->nextRowIndexToBeProcessed) {
 194                  // return empty row if mismatch between last processed row
 195                  // and the row that needs to be returned
 196                  $rowToBeProcessed = new Row([], null);
 197              }
 198          }
 199  
 200          \assert(null !== $rowToBeProcessed);
 201  
 202          return $rowToBeProcessed;
 203      }
 204  
 205      /**
 206       * Return the key of the current element. Here, the row index.
 207       *
 208       * @see http://php.net/manual/en/iterator.key.php
 209       */
 210      public function key(): int
 211      {
 212          // TODO: This should return $this->nextRowIndexToBeProcessed
 213          //       but to avoid a breaking change, the return value for
 214          //       this function has been kept as the number of rows read.
 215          return $this->shouldPreserveEmptyRows ?
 216                  $this->nextRowIndexToBeProcessed :
 217                  $this->numReadRows;
 218      }
 219  
 220      /**
 221       * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml
 222       *
 223       * @return string path of the XML file containing the sheet data,
 224       *                without the leading slash
 225       */
 226      private function normalizeSheetDataXMLFilePath(string $sheetDataXMLFilePath): string
 227      {
 228          return ltrim($sheetDataXMLFilePath, '/');
 229      }
 230  
 231      /**
 232       * Returns whether we need data for the next row to be processed.
 233       * We don't need to read data if:
 234       *   we have already read at least one row
 235       *     AND
 236       *   we need to preserve empty rows
 237       *     AND
 238       *   the last row that was read is not the row that need to be processed
 239       *   (i.e. if we need to return empty rows).
 240       *
 241       * @return bool whether we need data for the next row to be processed
 242       */
 243      private function doesNeedDataForNextRowToBeProcessed(): bool
 244      {
 245          $hasReadAtLeastOneRow = (0 !== $this->lastRowIndexProcessed);
 246  
 247          return
 248              !$hasReadAtLeastOneRow
 249              || !$this->shouldPreserveEmptyRows
 250              || $this->lastRowIndexProcessed < $this->nextRowIndexToBeProcessed;
 251      }
 252  
 253      /**
 254       * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If a shared string was not found
 255       * @throws \OpenSpout\Common\Exception\IOException                   If unable to read the sheet data XML
 256       */
 257      private function readDataForNextRow(): void
 258      {
 259          $this->currentlyProcessedRow = new Row([], null);
 260  
 261          $this->xmlProcessor->readUntilStopped();
 262  
 263          $this->rowBuffer = $this->currentlyProcessedRow;
 264      }
 265  
 266      /**
 267       * @param XMLReader $xmlReader XMLReader object, positioned on a "<dimension>" starting node
 268       *
 269       * @return int A return code that indicates what action should the processor take next
 270       */
 271      private function processDimensionStartingNode(XMLReader $xmlReader): int
 272      {
 273          // Read dimensions of the sheet
 274          $dimensionRef = $xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet)
 275          \assert(null !== $dimensionRef);
 276          if (1 === preg_match('/[A-Z]+\d+:([A-Z]+\d+)/', $dimensionRef, $matches)) {
 277              $this->numColumns = CellHelper::getColumnIndexFromCellIndex($matches[1]) + 1;
 278          }
 279  
 280          return XMLProcessor::PROCESSING_CONTINUE;
 281      }
 282  
 283      /**
 284       * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" starting node
 285       *
 286       * @return int A return code that indicates what action should the processor take next
 287       */
 288      private function processRowStartingNode(XMLReader $xmlReader): int
 289      {
 290          // Reset index of the last processed column
 291          $this->lastColumnIndexProcessed = -1;
 292  
 293          // Mark the last processed row as the one currently being read
 294          $this->lastRowIndexProcessed = $this->getRowIndex($xmlReader);
 295  
 296          // Read spans info if present
 297          $numberOfColumnsForRow = $this->numColumns;
 298          $spans = $xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance
 299          if (null !== $spans && '' !== $spans) {
 300              [, $numberOfColumnsForRow] = explode(':', $spans);
 301              $numberOfColumnsForRow = (int) $numberOfColumnsForRow;
 302          }
 303  
 304          $cells = array_fill(0, $numberOfColumnsForRow, Cell::fromValue(''));
 305          $this->currentlyProcessedRow->setCells($cells);
 306  
 307          return XMLProcessor::PROCESSING_CONTINUE;
 308      }
 309  
 310      /**
 311       * @param XMLReader $xmlReader XMLReader object, positioned on a "<cell>" starting node
 312       *
 313       * @return int A return code that indicates what action should the processor take next
 314       */
 315      private function processCellStartingNode(XMLReader $xmlReader): int
 316      {
 317          $currentColumnIndex = $this->getColumnIndex($xmlReader);
 318  
 319          // NOTE: expand() will automatically decode all XML entities of the child nodes
 320          /** @var DOMElement $node */
 321          $node = $xmlReader->expand();
 322          $cell = $this->getCell($node);
 323  
 324          $this->currentlyProcessedRow->setCellAtIndex($cell, $currentColumnIndex);
 325          $this->lastColumnIndexProcessed = $currentColumnIndex;
 326  
 327          return XMLProcessor::PROCESSING_CONTINUE;
 328      }
 329  
 330      /**
 331       * @return int A return code that indicates what action should the processor take next
 332       */
 333      private function processRowEndingNode(): int
 334      {
 335          // if the fetched row is empty and we don't want to preserve it..,
 336          if (!$this->shouldPreserveEmptyRows && $this->currentlyProcessedRow->isEmpty()) {
 337              // ... skip it
 338              return XMLProcessor::PROCESSING_CONTINUE;
 339          }
 340  
 341          ++$this->numReadRows;
 342  
 343          // If needed, we fill the empty cells
 344          if (0 === $this->numColumns) {
 345              $this->rowManager->fillMissingIndexesWithEmptyCells($this->currentlyProcessedRow);
 346          }
 347  
 348          // at this point, we have all the data we need for the row
 349          // so that we can populate the buffer
 350          return XMLProcessor::PROCESSING_STOP;
 351      }
 352  
 353      /**
 354       * @return int A return code that indicates what action should the processor take next
 355       */
 356      private function processWorksheetEndingNode(): int
 357      {
 358          // The closing "</worksheet>" marks the end of the file
 359          $this->hasReachedEndOfFile = true;
 360  
 361          return XMLProcessor::PROCESSING_STOP;
 362      }
 363  
 364      /**
 365       * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" node
 366       *
 367       * @return int Row index
 368       *
 369       * @throws \OpenSpout\Common\Exception\InvalidArgumentException When the given cell index is invalid
 370       */
 371      private function getRowIndex(XMLReader $xmlReader): int
 372      {
 373          // Get "r" attribute if present (from something like <row r="3"...>
 374          $currentRowIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_ROW_INDEX);
 375  
 376          return (null !== $currentRowIndex) ?
 377                  (int) $currentRowIndex :
 378                  $this->lastRowIndexProcessed + 1;
 379      }
 380  
 381      /**
 382       * @param XMLReader $xmlReader XMLReader object, positioned on a "<c>" node
 383       *
 384       * @return int Column index
 385       *
 386       * @throws \OpenSpout\Common\Exception\InvalidArgumentException When the given cell index is invalid
 387       */
 388      private function getColumnIndex(XMLReader $xmlReader): int
 389      {
 390          // Get "r" attribute if present (from something like <c r="A1"...>
 391          $currentCellIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX);
 392  
 393          return (null !== $currentCellIndex) ?
 394                  CellHelper::getColumnIndexFromCellIndex($currentCellIndex) :
 395                  $this->lastColumnIndexProcessed + 1;
 396      }
 397  
 398      /**
 399       * Returns the cell with (unescaped) correctly marshalled, cell value associated to the given XML node.
 400       *
 401       * @return Cell The cell set with the associated with the cell
 402       */
 403      private function getCell(DOMElement $node): Cell
 404      {
 405          try {
 406              $cellValue = $this->cellValueFormatter->extractAndFormatNodeValue($node);
 407              $cell = Cell::fromValue($cellValue);
 408          } catch (InvalidValueException $exception) {
 409              $cell = new Cell\ErrorCell($exception->getInvalidValue(), null);
 410          }
 411  
 412          return $cell;
 413      }
 414  }