<?php

// if want to implement error collecting here, we'll need to use some sort
// of global data (probably trigger_error) because it's impossible to pass
// $config or $context to the callback functions.

/**
 * Handles referencing and dereferencing character entities
 */
class HTMLPurifier_EntityParser
{

    /**
     * Reference to entity lookup table.
     * @type HTMLPurifier_EntityLookup
     */
    protected $_entity_lookup;

    /**
     * Callback regex string for entities in text.
     * @type string
     */
    protected $_textEntitiesRegex;

    /**
     * Callback regex string for entities in attributes.
     * @type string
     */
    protected $_attrEntitiesRegex;

    /**
     * Regex matching an ampersand followed by one of the entity names whose
     * trailing semicolon is optional. Used to retry a failed lookup against
     * the beginning of a longer name.
     * @type string
     */
    protected $_semiOptionalPrefixRegex;

    /**
     * Builds the three regexes used by the text/attribute entity substitution.
     */
    public function __construct()
    {
        // From
        // http://stackoverflow.com/questions/15532252/why-is-reg-being-rendered-as-without-the-bounding-semicolon
        $semi_optional = "quot|QUOT|lt|LT|gt|GT|amp|AMP|AElig|Aacute|Acirc|Agrave|Aring|Atilde|Auml|COPY|Ccedil|ETH|Eacute|Ecirc|Egrave|Euml|Iacute|Icirc|Igrave|Iuml|Ntilde|Oacute|Ocirc|Ograve|Oslash|Otilde|Ouml|REG|THORN|Uacute|Ucirc|Ugrave|Uuml|Yacute|aacute|acirc|acute|aelig|agrave|aring|atilde|auml|brvbar|ccedil|cedil|cent|copy|curren|deg|divide|eacute|ecirc|egrave|eth|euml|frac12|frac14|frac34|iacute|icirc|iexcl|igrave|iquest|iuml|laquo|macr|micro|middot|nbsp|not|ntilde|oacute|ocirc|ograve|ordf|ordm|oslash|otilde|ouml|para|plusmn|pound|raquo|reg|sect|shy|sup1|sup2|sup3|szlig|thorn|times|uacute|ucirc|ugrave|uml|uuml|yacute|yen|yuml";

        // NB: three empty captures to put the fourth match in the right
        // place
        $this->_semiOptionalPrefixRegex = "/&()()()($semi_optional)/";

        // Alternatives shared by the text and attribute regexes; they fill
        // capture groups 1, 2 and 3 respectively.
        $numeric_or_strict =
            // hex
            '[#]x([a-fA-F0-9]+);?|'.
            // dec
            '[#]0*(\d+);?|'.
            // string (mandatory semicolon)
            // NB: order matters: match semicolon preferentially
            '([A-Za-z_:][A-Za-z0-9.\-_:]*);|';

        $this->_textEntitiesRegex =
            '/&(?:'.
            $numeric_or_strict.
            // string (optional semicolon)
            "($semi_optional)".
            ')/';

        $this->_attrEntitiesRegex =
            '/&(?:'.
            $numeric_or_strict.
            // string (optional semicolon)
            // don't match if trailing is equals or alphanumeric (URL
            // like)
            "($semi_optional)(?![=;A-Za-z0-9])".
            ')/';
    }

    /**
     * Substitute entities with the parsed equivalents. Use this on
     * textual data in an HTML document (as opposed to attributes.)
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteTextEntities($string)
    {
        return preg_replace_callback(
            $this->_textEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Substitute entities with the parsed equivalents. Use this on
     * attribute contents in documents.
     *
     * @param string $string String to have entities parsed.
     * @return string Parsed string.
     */
    public function substituteAttrEntities($string)
    {
        return preg_replace_callback(
            $this->_attrEntitiesRegex,
            array($this, 'entityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteTextEntities() and
     * substituteAttrEntities() that does the work.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                  one of index 1 (hex value), 2 (dec value), 3 (name
     *                  followed by a semicolon) or 4 (name whose semicolon
     *                  is optional) non-empty.
     * @return string Replacement string.
     */
    protected function entityCallback($matches)
    {
        $entity = $matches[0];

        // PCRE leaves out trailing groups that did not participate in the
        // match (a hex match has no index 2, 3 or 4), so every group is
        // probed with isset(). The error-suppression operator is avoided on
        // purpose: handlers installed with set_error_handler() are still
        // called for suppressed errors.
        $hex_part = isset($matches[1]) ? $matches[1] : '';
        $dec_part = isset($matches[2]) ? $matches[2] : '';
        $strict_name = isset($matches[3]) ? $matches[3] : '';
        $lenient_name = isset($matches[4]) ? $matches[4] : '';

        if ($hex_part !== '') {
            return HTMLPurifier_Encoder::unichr(hexdec($hex_part));
        }
        if ($dec_part !== '') {
            return HTMLPurifier_Encoder::unichr((int) $dec_part);
        }

        $named_part = $strict_name !== '' ? $strict_name : $lenient_name;

        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$named_part])) {
            return $this->_entity_lookup->table[$named_part];
        }

        // exact match didn't match anything, so test if
        // any of the semicolon optional match the prefix.
        // Only do this for a name captured by group 3: the retry itself
        // reports its name in group 4, which is what prevents an
        // infinite loop.
        if ($strict_name !== '') {
            return preg_replace_callback(
                $this->_semiOptionalPrefixRegex,
                array($this, 'entityCallback'),
                $entity
            );
        }
        return $entity;
    }

    // LEGACY CODE BELOW

    /**
     * Callback regex string for parsing entities.
     * @type string
     */
    protected $_substituteEntitiesRegex =
        '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';
        //     1. hex             2. dec      3. string (XML style)

    /**
     * Decimal to parsed string conversion table for special entities.
     * @type array
     */
    protected $_special_dec2str =
        array(
            34 => '"',
            38 => '&',
            39 => "'",
            60 => '<',
            62 => '>'
        );

    /**
     * Stripped entity names to decimal conversion table for special entities.
     * @type array
     */
    protected $_special_ent2dec =
        array(
            'quot' => 34,
            'amp'  => 38,
            'lt'   => 60,
            'gt'   => 62
        );

    /**
     * Substitutes non-special entities with their parsed equivalents. Since
     * running this every time a piece of character data is parsed would be
     * wasteful, it is run before everything else.
     *
     * @param string $string String to have non-special entities parsed.
     * @return string Parsed string.
     */
    public function substituteNonSpecialEntities($string)
    {
        // it will try to detect missing semicolons, but don't rely on it
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'nonSpecialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
     *
     * @param array $matches PCRE matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return string Replacement string.
     */
    protected function nonSpecialEntityCallback($matches)
    {
        // replaces all but big five
        $entity = $matches[0];
        $code = $this->legacyNumericCode($matches);

        if ($code !== null) {
            // abort for special characters
            if (isset($this->_special_dec2str[$code])) {
                return $entity;
            }
            return HTMLPurifier_Encoder::unichr($code);
        }

        $name = $matches[3];
        if (isset($this->_special_ent2dec[$name])) {
            return $entity;
        }
        if (!$this->_entity_lookup) {
            $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
        }
        if (isset($this->_entity_lookup->table[$name])) {
            return $this->_entity_lookup->table[$name];
        }
        return $entity;
    }

    /**
     * Substitutes only special entities with their parsed equivalents.
     *
     * @notice We try to avoid calling this function because otherwise, it
     * would have to be called a lot (for every parsed section).
     *
     * @param string $string String to have special entities parsed.
     * @return string Parsed string.
     */
    public function substituteSpecialEntities($string)
    {
        return preg_replace_callback(
            $this->_substituteEntitiesRegex,
            array($this, 'specialEntityCallback'),
            $string
        );
    }

    /**
     * Callback function for substituteSpecialEntities() that does the work.
     *
     * This callback has same syntax as nonSpecialEntityCallback().
     *
     * @param array $matches PCRE-style matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @return string Replacement string.
     */
    protected function specialEntityCallback($matches)
    {
        $entity = $matches[0];
        $code = $this->legacyNumericCode($matches);

        if ($code === null) {
            // Named entity: translate the name to its code, if it is special.
            $name = $matches[3];
            if (!isset($this->_special_ent2dec[$name])) {
                return $entity;
            }
            return $this->_special_dec2str[$this->_special_ent2dec[$name]];
        }

        return isset($this->_special_dec2str[$code])
            ? $this->_special_dec2str[$code]
            : $entity;
    }

    /**
     * Reads the character code out of a numeric match of the legacy regex.
     *
     * @param array $matches PCRE matches array produced by
     *                  $_substituteEntitiesRegex.
     * @return int|float|null Character code (hexdec() may return a float for
     *                  very long hex strings), or null when the match is a
     *                  named entity.
     */
    private function legacyNumericCode($matches)
    {
        $entity = $matches[0];
        if (!isset($entity[1]) || $entity[1] !== '#') {
            return null;
        }
        if (isset($entity[2]) && $entity[2] === 'x') {
            return hexdec($matches[1]);
        }
        return (int) $matches[2];
    }
}

// vim: et sw=4 sts=4
title
Description
Body
title
Description
Body
title
Description
Body
title
Body