Differences Between: [Versions 402 and 403]
1 <?php 2 3 declare(strict_types=1); 4 5 namespace OpenSpout\Reader\XLSX; 6 7 use DOMElement; 8 use OpenSpout\Common\Entity\Cell; 9 use OpenSpout\Common\Entity\Row; 10 use OpenSpout\Common\Exception\IOException; 11 use OpenSpout\Reader\Common\Manager\RowManager; 12 use OpenSpout\Reader\Common\XMLProcessor; 13 use OpenSpout\Reader\Exception\InvalidValueException; 14 use OpenSpout\Reader\RowIteratorInterface; 15 use OpenSpout\Reader\Wrapper\XMLReader; 16 use OpenSpout\Reader\XLSX\Helper\CellHelper; 17 use OpenSpout\Reader\XLSX\Helper\CellValueFormatter; 18 19 final class RowIterator implements RowIteratorInterface 20 { 21 /** 22 * Definition of XML nodes names used to parse data. 23 */ 24 public const XML_NODE_DIMENSION = 'dimension'; 25 public const XML_NODE_WORKSHEET = 'worksheet'; 26 public const XML_NODE_ROW = 'row'; 27 public const XML_NODE_CELL = 'c'; 28 29 /** 30 * Definition of XML attributes used to parse data. 31 */ 32 public const XML_ATTRIBUTE_REF = 'ref'; 33 public const XML_ATTRIBUTE_SPANS = 'spans'; 34 public const XML_ATTRIBUTE_ROW_INDEX = 'r'; 35 public const XML_ATTRIBUTE_CELL_INDEX = 'r'; 36 37 /** @var string Path of the XLSX file being read */ 38 private string $filePath; 39 40 /** @var string Path of the sheet data XML file as in [Content_Types].xml */ 41 private string $sheetDataXMLFilePath; 42 43 /** @var XMLReader The XMLReader object that will help read sheet's XML data */ 44 private XMLReader $xmlReader; 45 46 /** @var XMLProcessor Helper Object to process XML nodes */ 47 private XMLProcessor $xmlProcessor; 48 49 /** @var Helper\CellValueFormatter Helper to format cell values */ 50 private Helper\CellValueFormatter $cellValueFormatter; 51 52 /** @var RowManager Manages rows */ 53 private RowManager $rowManager; 54 55 /** 56 * TODO: This variable can be deleted when row indices get preserved. 57 * 58 * @var int Number of read rows 59 */ 60 private int $numReadRows = 0; 61 62 /** @var Row Contains the row currently processed */ 63 private Row $currentlyProcessedRow; 64 65 /** @var null|Row Buffer used to store the current row, while checking if there are more rows to read */ 66 private ?Row $rowBuffer; 67 68 /** @var bool Indicates whether all rows have been read */ 69 private bool $hasReachedEndOfFile = false; 70 71 /** @var int The number of columns the sheet has (0 meaning undefined) */ 72 private int $numColumns = 0; 73 74 /** @var bool Whether empty rows should be returned or skipped */ 75 private bool $shouldPreserveEmptyRows; 76 77 /** @var int Last row index processed (one-based) */ 78 private int $lastRowIndexProcessed = 0; 79 80 /** @var int Row index to be processed next (one-based) */ 81 private int $nextRowIndexToBeProcessed = 0; 82 83 /** @var int Last column index processed (zero-based) */ 84 private int $lastColumnIndexProcessed = -1; 85 86 /** 87 * @param string $filePath Path of the XLSX file being read 88 * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml 89 * @param bool $shouldPreserveEmptyRows Whether empty rows should be preserved 90 * @param XMLReader $xmlReader XML Reader 91 * @param XMLProcessor $xmlProcessor Helper to process XML files 92 * @param CellValueFormatter $cellValueFormatter Helper to format cell values 93 * @param RowManager $rowManager Manages rows 94 */ 95 public function __construct( 96 string $filePath, 97 string $sheetDataXMLFilePath, 98 bool $shouldPreserveEmptyRows, 99 XMLReader $xmlReader, 100 XMLProcessor $xmlProcessor, 101 CellValueFormatter $cellValueFormatter, 102 RowManager $rowManager 103 ) { 104 $this->filePath = $filePath; 105 $this->sheetDataXMLFilePath = $this->normalizeSheetDataXMLFilePath($sheetDataXMLFilePath); 106 $this->shouldPreserveEmptyRows = $shouldPreserveEmptyRows; 107 $this->xmlReader = $xmlReader; 108 $this->cellValueFormatter = $cellValueFormatter; 109 $this->rowManager = $rowManager; 110 111 // Register all callbacks to process different nodes when reading the XML file 112 $this->xmlProcessor = $xmlProcessor; 113 $this->xmlProcessor->registerCallback(self::XML_NODE_DIMENSION, XMLProcessor::NODE_TYPE_START, [$this, 'processDimensionStartingNode']); 114 $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_START, [$this, 'processRowStartingNode']); 115 $this->xmlProcessor->registerCallback(self::XML_NODE_CELL, XMLProcessor::NODE_TYPE_START, [$this, 'processCellStartingNode']); 116 $this->xmlProcessor->registerCallback(self::XML_NODE_ROW, XMLProcessor::NODE_TYPE_END, [$this, 'processRowEndingNode']); 117 $this->xmlProcessor->registerCallback(self::XML_NODE_WORKSHEET, XMLProcessor::NODE_TYPE_END, [$this, 'processWorksheetEndingNode']); 118 } 119 120 /** 121 * Rewind the Iterator to the first element. 122 * Initializes the XMLReader object that reads the associated sheet data. 123 * The XMLReader is configured to be safe from billion laughs attack. 124 * 125 * @see http://php.net/manual/en/iterator.rewind.php 126 * 127 * @throws \OpenSpout\Common\Exception\IOException If the sheet data XML cannot be read 128 */ 129 public function rewind(): void 130 { 131 $this->xmlReader->close(); 132 133 if (false === $this->xmlReader->openFileInZip($this->filePath, $this->sheetDataXMLFilePath)) { 134 throw new IOException("Could not open \"{$this->sheetDataXMLFilePath}\"."); 135 } 136 137 $this->numReadRows = 0; 138 $this->lastRowIndexProcessed = 0; 139 $this->nextRowIndexToBeProcessed = 0; 140 $this->rowBuffer = null; 141 $this->hasReachedEndOfFile = false; 142 $this->numColumns = 0; 143 144 $this->next(); 145 } 146 147 /** 148 * Checks if current position is valid. 149 * 150 * @see http://php.net/manual/en/iterator.valid.php 151 */ 152 public function valid(): bool 153 { 154 $valid = !$this->hasReachedEndOfFile; 155 if (!$valid) { 156 $this->xmlReader->close(); 157 } 158 159 return $valid; 160 } 161 162 /** 163 * Move forward to next element. Reads data describing the next unprocessed row. 164 * 165 * @see http://php.net/manual/en/iterator.next.php 166 * 167 * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If a shared string was not found 168 * @throws \OpenSpout\Common\Exception\IOException If unable to read the sheet data XML 169 */ 170 public function next(): void 171 { 172 ++$this->nextRowIndexToBeProcessed; 173 174 if ($this->doesNeedDataForNextRowToBeProcessed()) { 175 $this->readDataForNextRow(); 176 } 177 } 178 179 /** 180 * Return the current element, either an empty row or from the buffer. 181 * 182 * @see http://php.net/manual/en/iterator.current.php 183 */ 184 public function current(): Row 185 { 186 $rowToBeProcessed = $this->rowBuffer; 187 188 if ($this->shouldPreserveEmptyRows) { 189 // when we need to preserve empty rows, we will either return 190 // an empty row or the last row read. This depends whether the 191 // index of last row that was read matches the index of the last 192 // row whose value should be returned. 193 if ($this->lastRowIndexProcessed !== $this->nextRowIndexToBeProcessed) { 194 // return empty row if mismatch between last processed row 195 // and the row that needs to be returned 196 $rowToBeProcessed = new Row([], null); 197 } 198 } 199 200 \assert(null !== $rowToBeProcessed); 201 202 return $rowToBeProcessed; 203 } 204 205 /** 206 * Return the key of the current element. Here, the row index. 207 * 208 * @see http://php.net/manual/en/iterator.key.php 209 */ 210 public function key(): int 211 { 212 // TODO: This should return $this->nextRowIndexToBeProcessed 213 // but to avoid a breaking change, the return value for 214 // this function has been kept as the number of rows read. 215 return $this->shouldPreserveEmptyRows ? 216 $this->nextRowIndexToBeProcessed : 217 $this->numReadRows; 218 } 219 220 /** 221 * @param string $sheetDataXMLFilePath Path of the sheet data XML file as in [Content_Types].xml 222 * 223 * @return string path of the XML file containing the sheet data, 224 * without the leading slash 225 */ 226 private function normalizeSheetDataXMLFilePath(string $sheetDataXMLFilePath): string 227 { 228 return ltrim($sheetDataXMLFilePath, '/'); 229 } 230 231 /** 232 * Returns whether we need data for the next row to be processed. 233 * We don't need to read data if: 234 * we have already read at least one row 235 * AND 236 * we need to preserve empty rows 237 * AND 238 * the last row that was read is not the row that need to be processed 239 * (i.e. if we need to return empty rows). 240 * 241 * @return bool whether we need data for the next row to be processed 242 */ 243 private function doesNeedDataForNextRowToBeProcessed(): bool 244 { 245 $hasReadAtLeastOneRow = (0 !== $this->lastRowIndexProcessed); 246 247 return 248 !$hasReadAtLeastOneRow 249 || !$this->shouldPreserveEmptyRows 250 || $this->lastRowIndexProcessed < $this->nextRowIndexToBeProcessed 251 ; 252 } 253 254 /** 255 * @throws \OpenSpout\Reader\Exception\SharedStringNotFoundException If a shared string was not found 256 * @throws \OpenSpout\Common\Exception\IOException If unable to read the sheet data XML 257 */ 258 private function readDataForNextRow(): void 259 { 260 $this->currentlyProcessedRow = new Row([], null); 261 262 $this->xmlProcessor->readUntilStopped(); 263 264 $this->rowBuffer = $this->currentlyProcessedRow; 265 } 266 267 /** 268 * @param XMLReader $xmlReader XMLReader object, positioned on a "<dimension>" starting node 269 * 270 * @return int A return code that indicates what action should the processor take next 271 */ 272 private function processDimensionStartingNode(XMLReader $xmlReader): int 273 { 274 // Read dimensions of the sheet 275 $dimensionRef = $xmlReader->getAttribute(self::XML_ATTRIBUTE_REF); // returns 'A1:M13' for instance (or 'A1' for empty sheet) 276 \assert(null !== $dimensionRef); 277 if (1 === preg_match('/[A-Z]+\d+:([A-Z]+\d+)/', $dimensionRef, $matches)) { 278 $this->numColumns = CellHelper::getColumnIndexFromCellIndex($matches[1]) + 1; 279 } 280 281 return XMLProcessor::PROCESSING_CONTINUE; 282 } 283 284 /** 285 * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" starting node 286 * 287 * @return int A return code that indicates what action should the processor take next 288 */ 289 private function processRowStartingNode(XMLReader $xmlReader): int 290 { 291 // Reset index of the last processed column 292 $this->lastColumnIndexProcessed = -1; 293 294 // Mark the last processed row as the one currently being read 295 $this->lastRowIndexProcessed = $this->getRowIndex($xmlReader); 296 297 // Read spans info if present 298 $numberOfColumnsForRow = $this->numColumns; 299 $spans = $xmlReader->getAttribute(self::XML_ATTRIBUTE_SPANS); // returns '1:5' for instance 300 if (null !== $spans) { 301 [, $numberOfColumnsForRow] = explode(':', $spans); 302 $numberOfColumnsForRow = (int) $numberOfColumnsForRow; 303 } 304 305 $cells = array_fill(0, $numberOfColumnsForRow, Cell::fromValue('')); 306 $this->currentlyProcessedRow->setCells($cells); 307 308 return XMLProcessor::PROCESSING_CONTINUE; 309 } 310 311 /** 312 * @param XMLReader $xmlReader XMLReader object, positioned on a "<cell>" starting node 313 * 314 * @return int A return code that indicates what action should the processor take next 315 */ 316 private function processCellStartingNode(XMLReader $xmlReader): int 317 { 318 $currentColumnIndex = $this->getColumnIndex($xmlReader); 319 320 // NOTE: expand() will automatically decode all XML entities of the child nodes 321 /** @var DOMElement $node */ 322 $node = $xmlReader->expand(); 323 $cell = $this->getCell($node); 324 325 $this->currentlyProcessedRow->setCellAtIndex($cell, $currentColumnIndex); 326 $this->lastColumnIndexProcessed = $currentColumnIndex; 327 328 return XMLProcessor::PROCESSING_CONTINUE; 329 } 330 331 /** 332 * @return int A return code that indicates what action should the processor take next 333 */ 334 private function processRowEndingNode(): int 335 { 336 // if the fetched row is empty and we don't want to preserve it.., 337 if (!$this->shouldPreserveEmptyRows && $this->currentlyProcessedRow->isEmpty()) { 338 // ... skip it 339 return XMLProcessor::PROCESSING_CONTINUE; 340 } 341 342 ++$this->numReadRows; 343 344 // If needed, we fill the empty cells 345 if (0 === $this->numColumns) { 346 $this->rowManager->fillMissingIndexesWithEmptyCells($this->currentlyProcessedRow); 347 } 348 349 // at this point, we have all the data we need for the row 350 // so that we can populate the buffer 351 return XMLProcessor::PROCESSING_STOP; 352 } 353 354 /** 355 * @return int A return code that indicates what action should the processor take next 356 */ 357 private function processWorksheetEndingNode(): int 358 { 359 // The closing "</worksheet>" marks the end of the file 360 $this->hasReachedEndOfFile = true; 361 362 return XMLProcessor::PROCESSING_STOP; 363 } 364 365 /** 366 * @param XMLReader $xmlReader XMLReader object, positioned on a "<row>" node 367 * 368 * @return int Row index 369 * 370 *@throws \OpenSpout\Common\Exception\InvalidArgumentException When the given cell index is invalid 371 */ 372 private function getRowIndex(XMLReader $xmlReader): int 373 { 374 // Get "r" attribute if present (from something like <row r="3"...> 375 $currentRowIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_ROW_INDEX); 376 377 return (null !== $currentRowIndex) ? 378 (int) $currentRowIndex : 379 $this->lastRowIndexProcessed + 1; 380 } 381 382 /** 383 * @param XMLReader $xmlReader XMLReader object, positioned on a "<c>" node 384 * 385 * @return int Column index 386 * 387 *@throws \OpenSpout\Common\Exception\InvalidArgumentException When the given cell index is invalid 388 */ 389 private function getColumnIndex(XMLReader $xmlReader): int 390 { 391 // Get "r" attribute if present (from something like <c r="A1"...> 392 $currentCellIndex = $xmlReader->getAttribute(self::XML_ATTRIBUTE_CELL_INDEX); 393 394 return (null !== $currentCellIndex) ? 395 CellHelper::getColumnIndexFromCellIndex($currentCellIndex) : 396 $this->lastColumnIndexProcessed + 1; 397 } 398 399 /** 400 * Returns the cell with (unescaped) correctly marshalled, cell value associated to the given XML node. 401 * 402 * @return Cell The cell set with the associated with the cell 403 */ 404 private function getCell(DOMElement $node): Cell 405 { 406 try { 407 $cellValue = $this->cellValueFormatter->extractAndFormatNodeValue($node); 408 $cell = Cell::fromValue($cellValue); 409 } catch (InvalidValueException $exception) { 410 $cell = new Cell\ErrorCell($exception->getInvalidValue(), null); 411 } 412 413 return $cell; 414 } 415 }
title
Description
Body
title
Description
Body
title
Description
Body
title
Body