See Release Notes
Long Term Support Release
<?php

/**
 * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  * Redistributions in binary form must reproduce the above
 *    copyright notice, this list of conditions and the following
 *    disclaimer in the documentation and/or other materials provided
 *    with the distribution.
 *
 *  * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
 *    the names of its contributors may be used to endorse or promote
 *    products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
 * OF SUCH DAMAGE.
 */

/*
 * This is a BSD License approved by the Open Source Initiative (OSI).
 * See:  http://www.opensource.org/licenses/bsd-license.php
 */

defined('MOODLE_INTERNAL') || die();

/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 *  baseUrl         the absolute base URL.
 *
 *  relativeUrl     the relative URL to convert.
 *
 * Return values:
 *  An absolute URL that combines parts of the base and relative
 *  URLs, or FALSE if the base URL is not absolute or if either
 *  URL cannot be parsed.
 */
function url_to_absolute($baseUrl, $relativeUrl)
{
    // If relative URL has a scheme, clean path and return.
    $r = split_url($relativeUrl);
    if ($r === false) {
        return false;
    }
    // Explicit '' comparisons are used for paths and hosts throughout:
    // empty() would treat the legitimate value "0" as missing.
    $rHasPath = isset($r['path']) && $r['path'] !== '';
    if (!empty($r['scheme'])) {
        if ($rHasPath && $r['path'][0] === '/') {
            $r['path'] = url_remove_dot_segments($r['path']);
        }
        return join_url($r);
    }

    // Make sure the base URL is absolute.
    $b = split_url($baseUrl);
    if ($b === false || empty($b['scheme']) || !isset($b['host']) || $b['host'] === '') {
        return false;
    }
    $r['scheme'] = $b['scheme'];
    if (!isset($b['path'])) {
        $b['path'] = '';
    }

    // If relative URL has an authority, clean path and return.
    if (isset($r['host'])) {
        if ($rHasPath) {
            $r['path'] = url_remove_dot_segments($r['path']);
        }
        return join_url($r);
    }
    unset($r['port'], $r['user'], $r['pass']);

    // Copy base authority.
    $r['host'] = $b['host'];
    foreach (array('port', 'user', 'pass') as $key) {
        if (isset($b[$key])) {
            $r[$key] = $b[$key];
        }
    }

    // If relative URL has no path, use base path (and base query, unless
    // the relative URL brought its own).
    if (!$rHasPath) {
        if ($b['path'] !== '') {
            $r['path'] = $b['path'];
        }
        if (!isset($r['query']) && isset($b['query'])) {
            $r['query'] = $b['query'];
        }
        return join_url($r);
    }

    // If relative URL path doesn't start with /, merge with base path:
    // keep everything in the base path before its last '/'.
    if ($r['path'][0] !== '/') {
        $base = core_text::strrchr($b['path'], '/', true);
        if ($base === false) {
            $base = '';
        }
        $r['path'] = $base . '/' . $r['path'];
    }
    $r['path'] = url_remove_dot_segments($r['path']);
    return join_url($r);
}

/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function implements the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs.  As in RFC3986, a path whose last
 * segment is "." or ".." names a directory, so the result keeps a
 * trailing "/" (e.g. "/b/c/.." becomes "/b/").  Empty segments
 * (repeated slashes) are collapsed.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification: "/" is a single byte that never occurs
 * inside a multi-byte UTF-8 sequence, so splitting on it is safe.
 *
 * Parameters:
 *  path    the path to filter
 *
 * Return values:
 *  The filtered path with "." and ".." removed.
 */
function url_remove_dot_segments($path)
{
    $path = (string) $path;
    if ($path === '') {
        return '';
    }

    $inSegs = explode('/', $path);
    $outSegs = array();
    foreach ($inSegs as $seg) {
        if ($seg === '' || $seg === '.') {
            continue;
        }
        if ($seg === '..') {
            array_pop($outSegs);
        } else {
            $outSegs[] = $seg;
        }
    }
    $outPath = implode('/', $outSegs);

    if ($path[0] === '/') {
        $outPath = '/' . $outPath;
    }

    // The input refers to a directory when it ends in "/", "/." or "/..";
    // in those cases the output must end in "/" as well.
    $last = end($inSegs);
    $endsInDirectory = count($inSegs) > 1 &&
        ($last === '' || $last === '.' || $last === '..');
    if ($outPath !== '/' && $endsInDirectory) {
        $outPath .= '/';
    }
    return $outPath;
}

/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *  URI-reference   = URI
 *                  / relative-ref
 *
 *  URI             = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-ref    = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  hier-part       = "//" authority path-abempty
 *                  / path-absolute
 *                  / path-rootless
 *                  / path-empty
 *
 *  relative-part   = "//" authority path-abempty
 *                  / path-absolute
 *                  / path-noscheme
 *                  / path-empty
 *
 *  authority       = [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *  scheme
 *      The name of a method used to interpret the rest of
 *      the URL.  Examples:  "http", "https", "mailto", "file".
 *
 *  authority
 *      The name of the authority governing the URL's name
 *      space.  Examples:  "example.com", "user@example.com",
 *      "example.com:80", "user:password@example.com:80".
 *
 *      The authority may include a host name, port number,
 *      user name, and password.
 *
 *      The host may be a name, an IPv4 numeric address, or
 *      an IPv6 numeric address.
 *
 *  path
 *      The hierarchical path to the URL's resource.
 *      Examples:  "/index.htm", "/scripts/page.php".
 *
 *  query
 *      The data for a query.  Examples:  "?search=google.com".
 *
 *  fragment
 *      The name of a secondary resource relative to that named
 *      by the path.  Examples:  "#section1", "#header".
 *
 * An "absolute" URL must include a scheme and path.  The authority, query,
 * and fragment components are optional.
 *
 * A "relative" URL does not include a scheme and must include a path.  The
 * authority, query, and fragment components are optional.
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *  "scheme"    The scheme, such as "http" (always lower-cased).
 *  "host"      The host name, IPv4, or IPv6 address.
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query.
 *  "fragment"  The fragment.
 *
 * One or more of these may not be present, depending upon the URL.
 *
 * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 * "path", "query", and "fragment" may have percent-encoded characters
 * decoded.  The "scheme" and "port" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been parsed.
 *
 * Parameters:
 *  url     the URL to parse.
 *
 *  decode  an optional boolean flag selecting whether
 *          to decode percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *  the associative array of URL parts (empty if the URL is an empty
 *  string), or FALSE if the URL is too malformed to recognize any parts.
 */
function split_url($url, $decode = false)
{
    // Character sets from RFC3986.
    $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    $xpchar        = $xunressub . ':@% ';

    // Scheme from RFC3986.  The "-" is escaped so that "+-." is not read
    // as a character range (which would also admit ",").
    $xscheme       = '([a-zA-Z][a-zA-Z\d+\-.]*)';

    // User info (user + password) from RFC3986.
    $xuserinfo     = '(([' . $xunressub . '%]*)' .
                     '(:([' . $xunressub . ':%]*))?)';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    $xipv6         = '(\[([a-fA-F\d.:]+)\])';

    // Host name from RFC1035.  Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    $xhost_name    = '([a-zA-Z\d\-.%]+)';

    // Authority from RFC3986.  Skip IP future.
    $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport         = '(\d*)';
    $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                     '?(:' . $xport . ')?)';

    // Path from RFC3986.  Blend absolute & relative for efficiency.
    $xslash_seg    = '(/[' . $xpchar . ']*)';
    $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
                     '|' . $xpath_rel . ')';

    // Query and fragment from RFC3986.
    $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

    // URL.
    $xurl          = '^(' . $xscheme . ':)?' . $xapath . '?' .
                     '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

    // Split the URL into components.
    if (!preg_match('!' . $xurl . '!', $url, $m)) {
        return false;
    }

    // preg_match() omits trailing groups that did not participate in the
    // match; pad the array so every group index (0..30) can be read and
    // compared against '' directly.  Comparing against '' rather than
    // using empty() keeps components whose value is "0".
    $m += array_fill(0, 31, '');

    $parts = array();

    if ($m[2] !== '') {
        $parts['scheme'] = strtolower($m[2]);
    }

    // Group 7 is "userinfo@", group 9 the user, group 10 ":pass".
    if ($m[7] !== '') {
        $parts['user'] = $m[9];
    }
    if ($m[10] !== '') {
        $parts['pass'] = $m[11];
    }

    // Groups 13/14/16 are host name, IPv4 and IPv6 (without brackets).
    // Group 5 is the whole "//authority"; if it matched without a host
    // the URL has an explicitly empty host.
    $hostIsName = false;
    if ($m[13] !== '') {
        $parts['host'] = $m[13];
        $hostIsName = true;
    } else if ($m[14] !== '') {
        $parts['host'] = $m[14];
    } else if ($m[16] !== '') {
        $parts['host'] = $m[16];
    } else if ($m[5] !== '') {
        $parts['host'] = '';
    }
    if ($m[17] !== '') {
        $parts['port'] = $m[18];
    }

    // Path after an authority (19), absolute path (21) or relative path (25).
    if ($m[19] !== '') {
        $parts['path'] = $m[19];
    } else if ($m[21] !== '') {
        $parts['path'] = $m[21];
    } else if ($m[25] !== '') {
        $parts['path'] = $m[25];
    }

    // Groups 27/29 include the leading "?" / "#", so "?" alone still
    // yields an (empty) query.
    if ($m[27] !== '') {
        $parts['query'] = $m[28];
    }
    if ($m[29] !== '') {
        $parts['fragment'] = $m[30];
    }

    if (!$decode) {
        return $parts;
    }

    foreach (array('user', 'pass', 'path', 'query', 'fragment') as $key) {
        if (isset($parts[$key]) && $parts[$key] !== '') {
            $parts[$key] = rawurldecode($parts[$key]);
        }
    }
    // Only host names are decoded, never IP address literals.
    if ($hostIsName) {
        $parts['host'] = rawurldecode($parts['host']);
    }
    return $parts;
}

/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *  "scheme"    The scheme, such as "http".
 *  "host"      The host name, IPv4, or IPv6 address.
 *  "port"      The port number.
 *  "user"      The user name.
 *  "pass"      The user password.
 *  "path"      The path, such as a file path for "http".
 *  "query"     The query.
 *  "fragment"  The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (if a host name, not an
 * IP address), "path", "query", and "fragment" components.  The "scheme"
 * and "port" are never encoded.  The "path" is presumed to be
 * hierarchical and encoding processes each segment of the hierarchy
 * separately (i.e., the slashes are left alone).
 *
 * The assembled URL string is returned.
 *
 * Parameters:
 *  parts   an associative array of strings containing the
 *          individual parts of a URL.
 *
 *  encode  an optional boolean flag selecting whether
 *          to do percent encoding or not.  Default = FALSE.
 *
 * Return values:
 *  Returns the assembled URL string.  The string is an absolute
 *  URL if a scheme is supplied, and a relative URL if not.  An
 *  empty string is returned if the $parts array does not contain
 *  any of the needed values.
 */
function join_url($parts, $encode = false)
{
    if ($encode) {
        foreach (array('user', 'pass') as $key) {
            if (isset($parts[$key])) {
                $parts[$key] = rawurlencode($parts[$key]);
            }
        }
        // Leave IP literals (bracketed or bare digits/hex/dots/colons)
        // untouched; the whole host must match, not just its start or end.
        if (isset($parts['host']) &&
                !preg_match('!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'])) {
            $parts['host'] = rawurlencode($parts['host']);
        }
        if (isset($parts['path']) && $parts['path'] !== '') {
            // Encode everything, then restore the segment separators.
            $parts['path'] = preg_replace('!%2F!ui', '/',
                rawurlencode($parts['path']));
        }
        foreach (array('query', 'fragment') as $key) {
            if (isset($parts[$key])) {
                $parts[$key] = rawurlencode($parts[$key]);
            }
        }
    }

    // A path of "0" is valid, so test against '' instead of using empty().
    $path = isset($parts['path']) ? (string) $parts['path'] : '';

    $url = '';
    if (!empty($parts['scheme'])) {
        $url .= $parts['scheme'] . ':';
    }
    if (isset($parts['host'])) {
        $url .= '//';
        if (isset($parts['user'])) {
            $url .= $parts['user'];
            if (isset($parts['pass'])) {
                $url .= ':' . $parts['pass'];
            }
            $url .= '@';
        }
        if (preg_match('!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'])) {
            $url .= '[' . $parts['host'] . ']';    // IPv6
        } else {
            $url .= $parts['host'];                // IPv4 or name
        }
        if (isset($parts['port'])) {
            $url .= ':' . $parts['port'];
        }
        // A path that follows an authority must begin with "/".
        if ($path !== '' && $path[0] !== '/') {
            $url .= '/';
        }
    }
    $url .= $path;
    if (isset($parts['query'])) {
        $url .= '?' . $parts['query'];
    }
    if (isset($parts['fragment'])) {
        $url .= '#' . $parts['fragment'];
    }
    return $url;
}

/**
 * This function encodes URL to form a URL which is properly
 * percent encoded to replace disallowed characters.
 *
 * RFC3986 specifies the allowed characters in the URL as well as
 * reserved characters in the URL. This function replaces all the
 * disallowed characters in the URL with their respective percent
 * encodings. Already encoded characters are not encoded again,
 * such as '%20' is not encoded to '%2520'.
 *
 * Parameters:
 *  url     the url to encode.
 *
 * Return values:
 *  Returns the encoded URL string.
 */
function encode_url($url) {
    // Reserved character => pattern matching its percent-encoded form.
    // "%" must stay last: the patterns are applied in order, and undoing
    // "%25" earlier would let a literal "%253A" collapse all the way to ":".
    $reserved = array(
        ":" => '!%3A!ui',
        "/" => '!%2F!ui',
        "?" => '!%3F!ui',
        "#" => '!%23!ui',
        "[" => '!%5B!ui',
        "]" => '!%5D!ui',
        "@" => '!%40!ui',
        "!" => '!%21!ui',
        "$" => '!%24!ui',
        "&" => '!%26!ui',
        "'" => '!%27!ui',
        "(" => '!%28!ui',
        ")" => '!%29!ui',
        "*" => '!%2A!ui',
        "+" => '!%2B!ui',
        "," => '!%2C!ui',
        ";" => '!%3B!ui',
        "=" => '!%3D!ui',
        "%" => '!%25!ui',
    );

    // Encode everything, then put the reserved characters back.
    $url = rawurlencode($url);
    $url = preg_replace(array_values($reserved), array_keys($reserved), $url);
    return $url;
}

/**
 * Extract URLs from a web page.
 *
 * URLs are extracted from a long list of tags and attributes as defined
 * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
 * URLs are also extracted from tags and attributes that are common
 * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
 * and from WML 1.3 and 2.0.
 *
 * The function returns an associative array of associative arrays of
 * arrays of URLs.  The outermost array's keys are the tag (element) name,
 * such as "a" for <a> or "img" for <img>.  The values for these entries
 * are associative arrays where the keys are attribute names for those
 * tags, such as "href" for <a href="...">.  Finally, the values for
 * those arrays are URLs found in those tags and attributes throughout
 * the text.
 *
 * The "archive" attribute holds a list of URLs: space-separated on
 * <object> and comma-separated on <applet>.  Each list entry is returned
 * as a separate URL.  Only double-quoted and unquoted attribute values
 * are recognized.
 *
 * Parameters:
 *  text    the UTF-8 text to scan
 *
 * Return values:
 *  an associative array where keys are tags and values are an
 *  associative array where keys are attributes and values are
 *  an array of URLs.
 *
 * See:
 *  http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
 */
function extract_html_urls($text)
{
    $match_elements = array(
        // HTML
        array('element'=>'a',            'attribute'=>'href'),          // 2.0
        array('element'=>'a',            'attribute'=>'urn'),           // 2.0
        array('element'=>'base',         'attribute'=>'href'),          // 2.0
        array('element'=>'form',         'attribute'=>'action'),        // 2.0
        array('element'=>'img',          'attribute'=>'src'),           // 2.0
        array('element'=>'link',         'attribute'=>'href'),          // 2.0

        array('element'=>'applet',       'attribute'=>'code'),          // 3.2
        array('element'=>'applet',       'attribute'=>'codebase'),      // 3.2
        array('element'=>'area',         'attribute'=>'href'),          // 3.2
        array('element'=>'body',         'attribute'=>'background'),    // 3.2
        array('element'=>'img',          'attribute'=>'usemap'),        // 3.2
        array('element'=>'input',        'attribute'=>'src'),           // 3.2

        array('element'=>'applet',       'attribute'=>'archive'),       // 4.01
        array('element'=>'applet',       'attribute'=>'object'),        // 4.01
        array('element'=>'blockquote',   'attribute'=>'cite'),          // 4.01
        array('element'=>'del',          'attribute'=>'cite'),          // 4.01
        array('element'=>'frame',        'attribute'=>'longdesc'),      // 4.01
        array('element'=>'frame',        'attribute'=>'src'),           // 4.01
        array('element'=>'head',         'attribute'=>'profile'),       // 4.01
        array('element'=>'iframe',       'attribute'=>'longdesc'),      // 4.01
        array('element'=>'iframe',       'attribute'=>'src'),           // 4.01
        array('element'=>'img',          'attribute'=>'longdesc'),      // 4.01
        array('element'=>'input',        'attribute'=>'usemap'),        // 4.01
        array('element'=>'ins',          'attribute'=>'cite'),          // 4.01
        array('element'=>'object',       'attribute'=>'archive'),       // 4.01
        array('element'=>'object',       'attribute'=>'classid'),       // 4.01
        array('element'=>'object',       'attribute'=>'codebase'),      // 4.01
        array('element'=>'object',       'attribute'=>'data'),          // 4.01
        array('element'=>'object',       'attribute'=>'usemap'),        // 4.01
        array('element'=>'q',            'attribute'=>'cite'),          // 4.01
        array('element'=>'script',       'attribute'=>'src'),           // 4.01

        array('element'=>'audio',        'attribute'=>'src'),           // 5.0
        array('element'=>'command',      'attribute'=>'icon'),          // 5.0
        array('element'=>'embed',        'attribute'=>'src'),           // 5.0
        array('element'=>'event-source', 'attribute'=>'src'),           // 5.0
        array('element'=>'html',         'attribute'=>'manifest'),      // 5.0
        array('element'=>'source',       'attribute'=>'src'),           // 5.0
        array('element'=>'video',        'attribute'=>'src'),           // 5.0
        array('element'=>'video',        'attribute'=>'poster'),        // 5.0

        array('element'=>'bgsound',      'attribute'=>'src'),           // Extension
        array('element'=>'body',         'attribute'=>'credits'),       // Extension
        array('element'=>'body',         'attribute'=>'instructions'),  // Extension
        array('element'=>'body',         'attribute'=>'logo'),          // Extension
        array('element'=>'div',          'attribute'=>'href'),          // Extension
        array('element'=>'div',          'attribute'=>'src'),           // Extension
        array('element'=>'embed',        'attribute'=>'code'),          // Extension
        array('element'=>'embed',        'attribute'=>'pluginspage'),   // Extension
        array('element'=>'html',         'attribute'=>'background'),    // Extension
        array('element'=>'ilayer',       'attribute'=>'src'),           // Extension
        array('element'=>'img',          'attribute'=>'dynsrc'),        // Extension
        array('element'=>'img',          'attribute'=>'lowsrc'),        // Extension
        array('element'=>'input',        'attribute'=>'dynsrc'),        // Extension
        array('element'=>'input',        'attribute'=>'lowsrc'),        // Extension
        array('element'=>'table',        'attribute'=>'background'),    // Extension
        array('element'=>'td',           'attribute'=>'background'),    // Extension
        array('element'=>'th',           'attribute'=>'background'),    // Extension
        array('element'=>'layer',        'attribute'=>'src'),           // Extension
        array('element'=>'xml',          'attribute'=>'src'),           // Extension

        array('element'=>'button',       'attribute'=>'action'),        // Forms 2.0
        array('element'=>'datalist',     'attribute'=>'data'),          // Forms 2.0
        array('element'=>'form',         'attribute'=>'data'),          // Forms 2.0
        array('element'=>'input',        'attribute'=>'action'),        // Forms 2.0
        array('element'=>'select',       'attribute'=>'data'),          // Forms 2.0

        // XHTML
        array('element'=>'html',         'attribute'=>'xmlns'),

        // WML
        array('element'=>'access',       'attribute'=>'path'),            // 1.3
        array('element'=>'card',         'attribute'=>'onenterforward'),  // 1.3
        array('element'=>'card',         'attribute'=>'onenterbackward'), // 1.3
        array('element'=>'card',         'attribute'=>'ontimer'),         // 1.3
        array('element'=>'go',           'attribute'=>'href'),            // 1.3
        array('element'=>'option',       'attribute'=>'onpick'),          // 1.3
        array('element'=>'template',     'attribute'=>'onenterforward'),  // 1.3
        array('element'=>'template',     'attribute'=>'onenterbackward'), // 1.3
        array('element'=>'template',     'attribute'=>'ontimer'),         // 1.3
        array('element'=>'wml',          'attribute'=>'xmlns'),           // 2.0
    );

    $match_metas = array(
        'content-base',
        'content-location',
        'referer',
        'location',
        'refresh',
    );

    // Extract all elements (the text between "<" and ">" of opening tags).
    if (!preg_match_all('/<([a-z][^>]*)>/iu', $text, $matches)) {
        return array();
    }
    $elements = $matches[1];

    // Attribute value: group 3 is a double-quoted value (quotes stripped),
    // group 4 an unquoted value.  Group 4 is absent from the match array
    // when the quoted alternative matched, hence the isset() checks below.
    $value_pattern = '=(("([^"]*)")|([^\s]*))';

    $urls = array();

    // Match elements and attributes
    foreach ($match_elements as $match_element) {
        $name = $match_element['element'];
        $attr = $match_element['attribute'];
        $pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/iu';

        // Only "archive" carries a list of URLs; every other attribute
        // holds a single URL and must not be split.
        if ($attr === 'archive') {
            $split_pattern = ($name === 'object')
                ? '/\s+/u'      // Space-separated URL list
                : '/,\s*/u';    // Comma-separated URL list
        } else {
            $split_pattern = null;  // Single URL
        }

        foreach ($elements as $element) {
            if (!preg_match($pattern, $element, $match)) {
                continue;
            }
            $m = (isset($match[3]) && $match[3] !== '')
                ? $match[3]
                : (isset($match[4]) ? $match[4] : '');
            if ($split_pattern === null) {
                $urls[$name][$attr][] = $m;
                continue;
            }
            foreach (preg_split($split_pattern, $m, -1, PREG_SPLIT_NO_EMPTY) as $ms) {
                $urls[$name][$attr][] = $ms;
            }
        }
    }

    // Match meta http-equiv elements
    $content_pattern = '/content' . $value_pattern . '/iu';
    $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
    foreach ($match_metas as $match_meta) {
        $attr_pattern = '/http-equiv="?' . $match_meta . '"?/iu';
        foreach ($elements as $element) {
            if (!preg_match('/^meta/iu', $element) ||
                    !preg_match($attr_pattern, $element) ||
                    !preg_match($content_pattern, $element, $match)) {
                continue;
            }
            $m = (isset($match[3]) && $match[3] !== '')
                ? $match[3]
                : (isset($match[4]) ? $match[4] : '');
            if ($match_meta !== 'refresh') {
                $urls['meta']['http-equiv'][] = $m;
            } else if (preg_match($refresh_pattern, $m, $match)) {
                // "refresh" content is "<seconds>; url=<target>".
                $urls['meta']['http-equiv'][] = $match[2];
            }
        }
    }

    // Match style attributes
    $urls['style'] = array();
    $style_pattern = '/style' . $value_pattern . '/iu';
    foreach ($elements as $element) {
        if (!preg_match($style_pattern, $element, $match)) {
            continue;
        }
        $m = (isset($match[3]) && $match[3] !== '')
            ? $match[3]
            : (isset($match[4]) ? $match[4] : '');
        $style_urls = extract_css_urls($m);
        if (!empty($style_urls)) {
            $urls['style'] = array_merge_recursive($urls['style'], $style_urls);
        }
    }

    // Match style bodies
    if (preg_match_all('/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies)) {
        foreach ($style_bodies[1] as $style_body) {
            $style_urls = extract_css_urls($style_body);
            if (!empty($style_urls)) {
                $urls['style'] = array_merge_recursive($urls['style'], $style_urls);
            }
        }
    }
    if (empty($urls['style'])) {
        unset($urls['style']);
    }

    return $urls;
}

/**
 * Extract URLs from UTF-8 CSS text.
 *
 * URLs within @import statements and url() property functions are extracted
 * and returned in an associative array of arrays.  Array keys indicate
 * the use context for the URL, including:
 *
 *  "import"
 *  "property"
 *
 * Each value in the associative array is an array of URLs.  Backslash
 * escapes inside a URL are removed (the escaped character is kept).
 *
 * Parameters:
 *  text    the UTF-8 text to scan
 *
 * Return values:
 *  an associative array of arrays of URLs.
 *
 * See:
 *  http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
 */
function extract_css_urls($text)
{
    $urls = array();

    $url_pattern     = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
    $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
    $pattern         = '/(' .
         '(@import\s*[\'"]' . $url_pattern . '[\'"])' .
        '|(@import\s*' . $urlfunc_pattern . ')' .
        '|(' . $urlfunc_pattern . ')' . ')/iu';
    if (!preg_match_all($pattern, $text, $matches)) {
        return $urls;
    }

    // Capture group holding the URL for each syntax, and the context it
    // is reported under:
    //   3  => @import '...'  /  @import "..."
    //   7  => @import url(...)  /  url('...')  /  url("...")
    //   11 => url(...)  /  url('...')  /  url("...")
    $groups = array(
        3  => 'import',
        7  => 'import',
        11 => 'property',
    );
    foreach ($groups as $group => $context) {
        foreach ($matches[$group] as $match) {
            // Entries are '' where this alternative did not match; compare
            // against '' so that a URL of "0" is not discarded.
            if ($match !== '') {
                $urls[$context][] = preg_replace('/\\\\(.)/u', '\\1', $match);
            }
        }
    }

    return $urls;
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body