<?php

declare(strict_types=1);

namespace OpenSpout\Reader\ODS\Helper;

use DateInterval;
use DateTimeImmutable;
use DOMElement;
use DOMNode;
use DOMText;
use Exception;
use OpenSpout\Common\Helper\Escaper\ODS;
use OpenSpout\Reader\Exception\InvalidValueException;

/**
 * Converts the XML node of an ODS table cell into the matching PHP value,
 * based on the cell's "office:value-type" attribute.
 *
 * @internal
 */
final class CellValueFormatter
{
    /**
     * Definition of all possible cell types.
     */
    public const CELL_TYPE_STRING = 'string';
    public const CELL_TYPE_FLOAT = 'float';
    public const CELL_TYPE_BOOLEAN = 'boolean';
    public const CELL_TYPE_DATE = 'date';
    public const CELL_TYPE_TIME = 'time';
    public const CELL_TYPE_CURRENCY = 'currency';
    public const CELL_TYPE_PERCENTAGE = 'percentage';
    public const CELL_TYPE_VOID = 'void';

    /**
     * Definition of XML nodes names used to parse data.
     */
    public const XML_NODE_P = 'p';
    public const XML_NODE_TEXT_A = 'text:a';
    public const XML_NODE_TEXT_SPAN = 'text:span';
    public const XML_NODE_TEXT_S = 'text:s';
    public const XML_NODE_TEXT_TAB = 'text:tab';
    public const XML_NODE_TEXT_LINE_BREAK = 'text:line-break';

    /**
     * Definition of XML attributes used to parse data.
     */
    public const XML_ATTRIBUTE_TYPE = 'office:value-type';
    public const XML_ATTRIBUTE_VALUE = 'office:value';
    public const XML_ATTRIBUTE_BOOLEAN_VALUE = 'office:boolean-value';
    public const XML_ATTRIBUTE_DATE_VALUE = 'office:date-value';
    public const XML_ATTRIBUTE_TIME_VALUE = 'office:time-value';
    public const XML_ATTRIBUTE_CURRENCY = 'office:currency';
    public const XML_ATTRIBUTE_C = 'text:c';

    /**
     * List of XML nodes representing whitespaces and their corresponding value.
     */
    private const WHITESPACE_XML_NODES = [
        self::XML_NODE_TEXT_S => ' ',
        self::XML_NODE_TEXT_TAB => "\t",
        self::XML_NODE_TEXT_LINE_BREAK => "\n",
    ];

    /**
     * @param bool $shouldFormatDates Whether date/time values should be returned as PHP objects or be formatted as strings
     * @param ODS  $escaper           Used to unescape XML data
     */
    public function __construct(
        private bool $shouldFormatDates,
        private ODS $escaper,
    ) {
    }

    /**
     * Returns the (unescaped) correctly marshalled, cell value associated to the given XML node.
     *
     * @see http://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#refTable13
     *
     * @return bool|DateInterval|DateTimeImmutable|float|int|string The value associated with the cell, empty string if cell's type is void/undefined
     *
     * @throws InvalidValueException If the node value is not valid
     */
    public function extractAndFormatNodeValue(DOMElement $node): bool|DateInterval|DateTimeImmutable|float|int|string
    {
        $cellType = $node->getAttribute(self::XML_ATTRIBUTE_TYPE);

        return match ($cellType) {
            self::CELL_TYPE_STRING => $this->formatStringCellValue($node),
            self::CELL_TYPE_FLOAT => $this->formatFloatCellValue($node),
            self::CELL_TYPE_BOOLEAN => $this->formatBooleanCellValue($node),
            self::CELL_TYPE_DATE => $this->formatDateCellValue($node),
            self::CELL_TYPE_TIME => $this->formatTimeCellValue($node),
            self::CELL_TYPE_CURRENCY => $this->formatCurrencyCellValue($node),
            self::CELL_TYPE_PERCENTAGE => $this->formatPercentageCellValue($node),
            // "void", a missing type attribute and any unknown type all yield an empty string
            default => '',
        };
    }

    /**
     * Returns the cell String value: the text of every "p" node found under
     * the cell, joined with line breaks and then unescaped.
     *
     * @return string The value associated with the cell
     */
    private function formatStringCellValue(DOMElement $node): string
    {
        $pNodeValues = [];
        $pNodes = $node->getElementsByTagName(self::XML_NODE_P);

        foreach ($pNodes as $pNode) {
            $pNodeValues[] = $this->extractTextValueFromNode($pNode);
        }

        $escapedCellValue = implode("\n", $pNodeValues);

        return $this->escaper->unescape($escapedCellValue);
    }

    /**
     * Returns the cell Numeric value from the given node.
     *
     * @return float|int The value associated with the cell
     */
    private function formatFloatCellValue(DOMElement $node): float|int
    {
        $nodeValue = $node->getAttribute(self::XML_ATTRIBUTE_VALUE);

        $nodeIntValue = (int) $nodeValue;
        $nodeFloatValue = (float) $nodeValue;

        // Return an int when the value has no fractional part, a float otherwise
        return ((float) $nodeIntValue === $nodeFloatValue) ? $nodeIntValue : $nodeFloatValue;
    }

    /**
     * Returns the cell Boolean value from the given node.
     *
     * @return bool The value associated with the cell
     */
    private function formatBooleanCellValue(DOMElement $node): bool
    {
        $rawValue = trim($node->getAttribute(self::XML_ATTRIBUTE_BOOLEAN_VALUE));

        // The attribute holds the literal text "true" or "false". A plain (bool)
        // cast would turn the non-empty string "false" into true.
        if ('false' === strtolower($rawValue)) {
            return false;
        }

        // Any other value keeps PHP's usual string-to-bool conversion
        // ("true", "1" => true; "", "0" => false).
        return (bool) $rawValue;
    }

    /**
     * Returns the cell Date value from the given node.
     *
     * @return DateTimeImmutable|string The displayed text when dates should be formatted
     *                                  (or the raw "date-value" attribute if the cell has no "p" node),
     *                                  a DateTimeImmutable otherwise
     *
     * @throws InvalidValueException If the value is not a valid date
     */
    private function formatDateCellValue(DOMElement $node): string|DateTimeImmutable
    {
        // The XML node looks like this:
        // <table:table-cell calcext:value-type="date" office:date-value="2016-05-19T16:39:00" office:value-type="date">
        //   <text:p>05/19/16 04:39 PM</text:p>
        // </table:table-cell>

        $nodeValue = $node->getAttribute(self::XML_ATTRIBUTE_DATE_VALUE);

        if ($this->shouldFormatDates) {
            // The date is already formatted in the "p" tag
            return $this->extractDisplayedValue($node, $nodeValue);
        }

        // otherwise, build it from the "date-value" attribute
        try {
            return new DateTimeImmutable($nodeValue);
        } catch (Exception $previous) {
            throw new InvalidValueException($nodeValue, '', 0, $previous);
        }
    }

    /**
     * Returns the cell Time value from the given node.
     *
     * @return DateInterval|string The displayed text when dates should be formatted
     *                             (or the raw "time-value" attribute if the cell has no "p" node),
     *                             a DateInterval otherwise
     *
     * @throws InvalidValueException If the value is not a valid time
     */
    private function formatTimeCellValue(DOMElement $node): DateInterval|string
    {
        // The XML node looks like this:
        // <table:table-cell calcext:value-type="time" office:time-value="PT13H24M00S" office:value-type="time">
        //   <text:p>01:24:00 PM</text:p>
        // </table:table-cell>

        $nodeValue = $node->getAttribute(self::XML_ATTRIBUTE_TIME_VALUE);

        if ($this->shouldFormatDates) {
            // The time is already formatted in the "p" tag
            return $this->extractDisplayedValue($node, $nodeValue);
        }

        // otherwise, build it from the "time-value" attribute
        try {
            return new DateInterval($nodeValue);
        } catch (Exception $previous) {
            throw new InvalidValueException($nodeValue, '', 0, $previous);
        }
    }

    /**
     * Returns the text of the first "p" node found under the given cell.
     *
     * @param DOMElement $node          The cell node
     * @param string     $fallbackValue Returned when the cell has no "p" node, instead of failing on a null node
     *
     * @return string The displayed text, or the fallback value
     */
    private function extractDisplayedValue(DOMElement $node, string $fallbackValue): string
    {
        $nodeWithValueAlreadyFormatted = $node->getElementsByTagName(self::XML_NODE_P)->item(0);

        return $nodeWithValueAlreadyFormatted?->nodeValue ?? $fallbackValue;
    }

    /**
     * Returns the cell Currency value from the given node.
     *
     * @return string The value associated with the cell (e.g. "100 USD" or "9.99 EUR")
     */
    private function formatCurrencyCellValue(DOMElement $node): string
    {
        $value = $node->getAttribute(self::XML_ATTRIBUTE_VALUE);
        $currency = $node->getAttribute(self::XML_ATTRIBUTE_CURRENCY);

        return "{$value} {$currency}";
    }

    /**
     * Returns the cell Percentage value from the given node.
     *
     * @return float|int The value associated with the cell
     */
    private function formatPercentageCellValue(DOMElement $node): float|int
    {
        // percentages are formatted like floats
        return $this->formatFloatCellValue($node);
    }

    /**
     * Builds the text of the given node by walking its children: text nodes are
     * appended as is, whitespace elements are expanded, and "text:a"/"text:span"
     * elements are processed recursively. Any other child is ignored.
     *
     * @return string The concatenated (still escaped) text
     */
    private function extractTextValueFromNode(DOMNode $pNode): string
    {
        $textValue = '';

        foreach ($pNode->childNodes as $childNode) {
            if ($childNode instanceof DOMText) {
                $textValue .= $childNode->nodeValue;
            } elseif ($this->isWhitespaceNode($childNode->nodeName) && $childNode instanceof DOMElement) {
                $textValue .= $this->transformWhitespaceNode($childNode);
            } elseif (self::XML_NODE_TEXT_A === $childNode->nodeName || self::XML_NODE_TEXT_SPAN === $childNode->nodeName) {
                $textValue .= $this->extractTextValueFromNode($childNode);
            }
        }

        return $textValue;
    }

    /**
     * Returns whether the given node is a whitespace node. It must be one of these:
     *  - <text:s />
     *  - <text:tab />
     *  - <text:line-break />.
     */
    private function isWhitespaceNode(string $nodeName): bool
    {
        return isset(self::WHITESPACE_XML_NODES[$nodeName]);
    }

    /**
     * The "<text:p>" node can contain the string value directly
     * or contain child elements. In this case, whitespaces contained in
     * the child elements are represented by XML elements:
     *  - space => <text:s />
     *  - tab => <text:tab />
     *  - line break => <text:line-break />.
     *
     * @see https://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#__RefHeading__1415200_253892949
     *
     * @param DOMElement $node The XML node representing a whitespace
     *
     * @return string The corresponding whitespace value
     */
    private function transformWhitespaceNode(DOMElement $node): string
    {
        $countAttribute = $node->getAttribute(self::XML_ATTRIBUTE_C); // only defined for "<text:s>"

        // A missing count means a single whitespace. Negative counts are clamped
        // to 0 because str_repeat() throws a ValueError on a negative multiplier.
        $numWhitespaces = '' !== $countAttribute ? max(0, (int) $countAttribute) : 1;

        return str_repeat(self::WHITESPACE_XML_NODES[$node->nodeName], $numWhitespaces);
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body