Differences Between: [Versions 402 and 403]
1 <?php 2 3 declare(strict_types=1); 4 5 namespace OpenSpout\Reader\XLSX; 6 7 use DOMElement; 8 use OpenSpout\Common\Entity\Cell; 9 use OpenSpout\Common\Entity\Row; 10 use OpenSpout\Common\Exception\IOException; 11 use OpenSpout\Reader\Common\Manager\RowManager; 12 use OpenSpout\Reader\Common\XMLProcessor; 13 use OpenSpout\Reader\Exception\InvalidValueException; 14 use OpenSpout\Reader\RowIteratorInterface; 15 use OpenSpout\Reader\Wrapper\XMLReader; 16 use OpenSpout\Reader\XLSX\Helper\CellHelper; 17 use OpenSpout\Reader\XLSX\Helper\CellValueFormatter; 18 19 final class RowIterator implements RowIteratorInterface 20 { 21 /** 22 * Definition of XML nodes names used to parse data. 23 */ 24 public const XML_NODE_DIMENSION = 'dimension'; 25 public const XML_NODE_WORKSHEET = 'worksheet'; 26 public const XML_NODE_ROW = 'row'; 27 public const XML_NODE_CELL = 'c'; 28 29 /** 30 * Definition of XML attributes used to parse data. 31 */ 32 public const XML_ATTRIBUTE_REF = 'ref'; 33 public const XML_ATTRIBUTE_SPANS = 'spans'; 34 public const XML_ATTRIBUTE_ROW_INDEX = 'r'; 35 public const XML_ATTRIBUTE_CELL_INDEX = 'r'; 36 37 /** @var string Path of the XLSX file being read */ 38 private string $filePath; 39 40 /** @var string Path of the sheet data XML file as in [Content_Types].xml */ 41 private string $sheetDataXMLFilePath; 42 43 /** @var XMLReader The XMLReader object that will help read sheet's XML data */ 44 private XMLReader $xmlReader; 45 46 /** @var XMLProcessor Helper Object to process XML nodes */ 47 private XMLProcessor $xmlProcessor; 48 49 /** @var Helper\CellValueFormatter Helper to format cell values */ 50 private Helper\CellValueFormatter $cellValueFormatter; 51 52 /** @var RowManager Manages rows */ 53 private RowManager $rowManager; 54 55 /** 56 * TODO: This variable can be deleted when row indices get preserved. 57 * 58 * @var int Number of read rows 59 */ 60 private int $numReadRows = 0; 61 62 /** @var Row Contains the row currently processed */ 63 private Row $currentlyProcessedRow; 64 65 /** @var null|Row Buffer used to store the current row, while checking if there are more rows to read */ 66 private ?Row $rowBuffer; 67 68 /** @var bool Indicates whether all rows have been read */ 69 private bool $hasReachedEndOfFile = false; 70 71 /** @var int The number of columns the sheet has (0 meaning undefined) */ 72 private int $numColumns = 0; 73 74 /** @var bool Whether empty rows should be returned or skipped */ 75 private bool $shouldPreserveEmptyRows; 76 77 /** @var int Last row index processed (one-based) */ 78 private int $lastRowIndexProcessed = 0; 79 80 /** @var int Row index to be processed next (one-based) */ 81 private int $nextRowIndexToBeProcessed = 0; 82 83 /** @var int Last column index processed (zero-based) */ 84 private int $lastColumnIndexProcessed = -1; 85 86 /** 87 * @param string $filePath Path of the XLSX file being read 88 * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml 89 * @param bool $shouldPreserveEmptyRows Whether empty rows should be preserved 90 * @param XMLReader $xmlReader XML Reader 91 * @param XMLProcessor $xmlProcessor Helper to process XML files 92 * @param CellValueFormatter $cellValueFormatter Helper to format cell values 93 * @param RowManager $rowManager Manages rows 94 */ 95 public function __construct( 96 string $filePath, 97 string $sheetDataXMLFilePath, 98 bool $shouldPreserveEmptyRows, 99 XMLReader $xmlReader, 100 XMLProcessor $xmlProcessor, 101 CellValueFormatter $cellValueFormatter, 102 RowManager $rowManager 103 ) { 104 $this->filePath = $filePath; 105 $this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath); 106 $this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows; 107 $this->xmlReader = $xmlReader; 108 $this->cellValueFormatter = $cellValueFormatter; 109 $this->rowManager = $rowManager; 110 111 // Register all callbacks to process different nodes when reading the XML file 112 $this->xmlProcessor = $xmlProcessor; 113 $this->xmlProcessor->registerCallback(self::XML_NODE_DIMENSION, XMLProcessor::NODE_TYPE_START, [$this, 'processDimensionStartingNode']); 114 $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_START, [$this, 'processRowStartingNode']); 115 $this->xmlProcessor->registerCallback(self::XML_NODE_CELL, XMLProcessor::NODE_TYPE_START, [$this, 'processCellStartingNode']); 116 $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_END, [$this, 'processRowEndingNode']); 117 $this->xmlProcessor->registerCallback(self::XML_NODE_WORKSHEET, XMLProcessor::NODE_TYPE_END, [$this, 'processWorksheetEndingNode']); 118 } 119 120 /** 121 * Rewind the Iterator to the first element. 122 * Initializes the XMLReader object that reads the associated sheet data. 123 * The XMLReader is configured to be safe from billion laughs attack. 124 * 125 * @see http://php.net/manual/en/iterator.rewind.php 126 * 127 * @throws \OpenSpout\Common\Exception\IOException If the sheet data XML cannot be read 128 */ 129 public function rewind(): void 130 { 131 $this->xmlReader->close(); 132 133 if (false === $this->xmlReader->openFileInZip($this->filePath, $this->sheetDataXMLFilePath)) { 134 throw new IOException("Could not open \"{$this->sheetDataXMLFilePath}\"."); 135 } 136 137 $this->numReadRows = 0; 138 $this->lastRowIndexProcessed = 0; 139 $this->nextRowIndexToBeProcessed = 0; 140 $this->rowBuffer = null; 141 $this->hasReachedEndOfFile = false; 142 $this->numColumns = 0; 143 144 $this->next(); 145 } 146 147 /** 148 * Checks if current position is valid. 149 * 150 * @see http://php.net/manual/en/iterator.valid.php 151 */ 152 public function valid(): bool 153 { 154 $valid = !$this->hasReachedEndOfFile; 155 if (!$valid) { 156 $this->xmlReader->close(); 157 } 158 159 return $valid; 160 } 161 162 /** 163 * Move forward to next element. Reads data describing the next unprocessed row. 164 * 165 * @see http://php.net/manual/en/iterator.next.php 166 * 167 * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If a shared string was not found 168 * @throws \OpenSpout\Common\Exception\IOException If unable to read the sheet data XML 169 */ 170 public function next(): void 171 { 172 ++$this->nextRowIndexToBeProcessed; 173 174 if ($this->doesNeedDataForNextRowToBeProcessed()) { 175 $this->readDataForNextRow(); 176 } 177 } 178 179 /** 180 * Return the current element, either an empty row or from the buffer. 181 * 182 * @see http://php.net/manual/en/iterator.current.php 183 */ 184 public function current(): Row 185 { 186 $rowToBeProcessed = $this->rowBuffer; 187 188 if ($this->shouldPreserveEmptyRows) { 189 // when we need to preserve empty rows, we will either return 190 // an empty row or the last row read. This depends whether the 191 // index of last row that was read matches the index of the last 192 // row whose value should be returned. 193 if ($this->lastRowIndexProcessed !== $this->nextRowIndexToBeProcessed) { 194 // return empty row if mismatch between last processed row 195 // and the row that needs to be returned 196 $rowToBeProcessed = new Row([], null); 197 } 198 } 199 200 \assert(null !== $rowToBeProcessed); 201 202 return $rowToBeProcessed; 203 } 204 205 /** 206 * Return the key of the current element. Here, the row index. 207 * 208 * @see http://php.net/manual/en/iterator.key.php 209 */ 210 public function key(): int 211 { 212 // TODO: This should return $this->nextRowIndexToBeProcessed 213 // but to avoid a breaking change, the return value for 214 // this function has been kept as the number of rows read. 215 return $this->shouldPreserveEmptyRows ? 216 $this->nextRowIndexToBeProcessed : 217 $this->numReadRows; 218 } 219 220 /** 221 * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml 222 * 223 * @return string path of the XML file containing the sheet data, 224 * without the leading slash 225 */ 226 private function normalizeSheetDataXMLFilePath(string $sheetDataXMLFilePath): string 227 { 228 return ltrim($sheetDataXMLFilePath, '/'); 229 } 230 231 /** 232 * Returns whether we need data for the next row to be processed. 233 * We don't need to read data if: 234 * we have already read at least one row 235 * AND 236 * we need to preserve empty rows 237 * AND 238 * the last row that was read is not the row that need to be processed 239 * (i.e. if we need to return empty rows). 240 * 241 * @return bool whether we need data for the next row to be processed 242 */ 243 private function doesNeedDataForNextRowToBeProcessed(): bool 244 { 245 $hasReadAtLeastOneRow = (0 !== $this->lastRowIndexProcessed); 246 247 return 248 !$hasReadAtLeastOneRow 249 || !$this->shouldPreserveEmptyRows 250 || $this->lastRowIndexProcessed < $this->nextRowIndexToBeProcessed; 251 } 252 253 /** 254 * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If a shared string was not found 255 * @throws \OpenSpout\Common\Exception\IOException If unable to read the sheet data XML 256 */ 257 private function readDataForNextRow(): void 258 { 259 $this->currentlyProcessedRow = new Row([], null); 260 261 $this->xmlProcessor->readUntilStopped(); 262 263 $this->rowBuffer = $this->currentlyProcessedRow; 264 } 265 266 /** 267 * @param XMLReader $xmlReader XMLReader object, positioned on a "<dimension>" starting node 268 * 269 * @return int A return code that indicates what action should the processor take next 270 */ 271 private function processDimensionStartingNode(XMLReader $xmlReader): int 272 { 273 // Read dimensions of the sheet 274 $dimensionRef = $xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet) 275 \assert(null !== $dimensionRef); 276 if (1 === preg_match('/[A-Z]+\d+:([A-Z]+\d+)/', $dimensionRef, $matches)) { 277 $this->numColumns = CellHelper::getColumnIndexFromCellIndex($matches[1]) + 1; 278 } 279 280 return XMLProcessor::PROCESSING_CONTINUE; 281 } 282 283 /** 284 * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" starting node 285 * 286 * @return int A return code that indicates what action should the processor take next 287 */ 288 private function processRowStartingNode(XMLReader $xmlReader): int 289 { 290 // Reset index of the last processed column 291 $this->lastColumnIndexProcessed = -1; 292 293 // Mark the last processed row as the one currently being read 294 $this->lastRowIndexProcessed = $this->getRowIndex($xmlReader); 295 296 // Read spans info if present 297 $numberOfColumnsForRow = $this->numColumns; 298 $spans = $xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance 299 if (null !== $spans && '' !== $spans) { 300 [, $numberOfColumnsForRow] = explode(':', $spans); 301 $numberOfColumnsForRow = (int) $numberOfColumnsForRow; 302 } 303 304 $cells = array_fill(0, $numberOfColumnsForRow, Cell::fromValue('')); 305 $this->currentlyProcessedRow->setCells($cells); 306 307 return XMLProcessor::PROCESSING_CONTINUE; 308 } 309 310 /** 311 * @param XMLReader $xmlReader XMLReader object, positioned on a "<cell>" starting node 312 * 313 * @return int A return code that indicates what action should the processor take next 314 */ 315 private function processCellStartingNode(XMLReader $xmlReader): int 316 { 317 $currentColumnIndex = $this->getColumnIndex($xmlReader); 318 319 // NOTE: expand() will automatically decode all XML entities of the child nodes 320 /** @var DOMElement $node */ 321 $node = $xmlReader->expand(); 322 $cell = $this->getCell($node); 323 324 $this->currentlyProcessedRow->setCellAtIndex($cell, $currentColumnIndex); 325 $this->lastColumnIndexProcessed = $currentColumnIndex; 326 327 return XMLProcessor::PROCESSING_CONTINUE; 328 } 329 330 /** 331 * @return int A return code that indicates what action should the processor take next 332 */ 333 private function processRowEndingNode(): int 334 { 335 // if the fetched row is empty and we don't want to preserve it.., 336 if (!$this->shouldPreserveEmptyRows && $this->currentlyProcessedRow->isEmpty()) { 337 // ... skip it 338 return XMLProcessor::PROCESSING_CONTINUE; 339 } 340 341 ++$this->numReadRows; 342 343 // If needed, we fill the empty cells 344 if (0 === $this->numColumns) { 345 $this->rowManager->fillMissingIndexesWithEmptyCells($this->currentlyProcessedRow); 346 } 347 348 // at this point, we have all the data we need for the row 349 // so that we can populate the buffer 350 return XMLProcessor::PROCESSING_STOP; 351 } 352 353 /** 354 * @return int A return code that indicates what action should the processor take next 355 */ 356 private function processWorksheetEndingNode(): int 357 { 358 // The closing "</worksheet>" marks the end of the file 359 $this->hasReachedEndOfFile = true; 360 361 return XMLProcessor::PROCESSING_STOP; 362 } 363 364 /** 365 * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" node 366 * 367 * @return int Row index 368 * 369 * @throws \OpenSpout\Common\Exception\InvalidArgumentException When the given cell index is invalid 370 */ 371 private function getRowIndex(XMLReader $xmlReader): int 372 { 373 // Get "r" attribute if present (from something like <row r="3"...> 374 $currentRowIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_ROW_INDEX); 375 376 return (null !== $currentRowIndex) ? 377 (int) $currentRowIndex : 378 $this->lastRowIndexProcessed + 1; 379 } 380 381 /** 382 * @param XMLReader $xmlReader XMLReader object, positioned on a "<c>" node 383 * 384 * @return int Column index 385 * 386 * @throws \OpenSpout\Common\Exception\InvalidArgumentException When the given cell index is invalid 387 */ 388 private function getColumnIndex(XMLReader $xmlReader): int 389 { 390 // Get "r" attribute if present (from something like <c r="A1"...> 391 $currentCellIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX); 392 393 return (null !== $currentCellIndex) ? 394 CellHelper::getColumnIndexFromCellIndex($currentCellIndex) : 395 $this->lastColumnIndexProcessed + 1; 396 } 397 398 /** 399 * Returns the cell with (unescaped) correctly marshalled, cell value associated to the given XML node. 400 * 401 * @return Cell The cell set with the associated with the cell 402 */ 403 private function getCell(DOMElement $node): Cell 404 { 405 try { 406 $cellValue = $this->cellValueFormatter->extractAndFormatNodeValue($node); 407 $cell = Cell::fromValue($cellValue); 408 } catch (InvalidValueException $exception) { 409 $cell = new Cell\ErrorCell($exception->getInvalidValue(), null); 410 } 411 412 return $cell; 413 } 414 }
title
Description
Body
title
Description
Body
title
Description
Body
title
Body