Search moodle.org's
Developer Documentation

See Release Notes

  • Bug fixes for general core bugs in 4.0.x will end 8 May 2023 (12 months).
  • Bug fixes for security issues in 4.0.x will end 13 November 2023 (18 months).
  • PHP version: minimum PHP 7.3.0 Note: the minimum PHP version has increased since Moodle 3.10. PHP 7.4.x is also supported.
   1  <?php
   2  
   3  /**
   4   * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
   5   * All rights reserved.
   6   *
   7   * Redistribution and use in source and binary forms, with or without
   8   * modification, are permitted provided that the following conditions
   9   * are met:
  10   *
  11   *	 * Redistributions of source code must retain the above copyright
  12   *	   notice, this list of conditions and the following disclaimer.
  13   *
  14   *	 * Redistributions in binary form must reproduce the above
  15   *	   copyright notice, this list of conditions and the following
  16   *	   disclaimer in the documentation and/or other materials provided
  17   *	   with the distribution.
  18   *
  19   *	 * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
  20   *	   the names of its contributors may be used to endorse or promote
  21   *	   products derived from this software without specific prior
  22   *	   written permission.
  23   *
  24   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  27   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  28   * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  32   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34   * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
  35   * OF SUCH DAMAGE.
  36   */
  37  
  38  /*
  39   * This is a BSD License approved by the Open Source Initiative (OSI).
  40   * See:  http://www.opensource.org/licenses/bsd-license.php
  41   */
  42  
  43  defined('MOODLE_INTERNAL') || die();
  44  
  45  /**
  46   * Combine a base URL and a relative URL to produce a new
  47   * absolute URL.  The base URL is often the URL of a page,
  48   * and the relative URL is a URL embedded on that page.
  49   *
  50   * This function implements the "absolutize" algorithm from
  51   * the RFC3986 specification for URLs.
  52   *
  53   * This function supports multi-byte characters with the UTF-8 encoding,
  54   * per the URL specification.
  55   *
  56   * Parameters:
  57   * 	 baseUrl	 	 the absolute base URL.
  58   *
  59   * 	 url	 	 the relative URL to convert.
  60   *
  61   * Return values:
  62   * 	 An absolute URL that combines parts of the base and relative
  63   * 	 URLs, or FALSE if the base URL is not absolute or if either
  64   * 	 URL cannot be parsed.
  65   */
  66  function url_to_absolute( $baseUrl, $relativeUrl )
  67  {
  68  	 // If relative URL has a scheme, clean path and return.
  69  	 $r = split_url( $relativeUrl );
  70  	 if ( $r === FALSE )
  71  	 	 return FALSE;
  72  	 if ( !empty( $r['scheme'] ) )
  73  	 {
  74  	 	 if ( !empty( $r['path'] ) && $r['path'][0] == '/' )
  75  	 	 	 $r['path'] = url_remove_dot_segments( $r['path'] );
  76  	 	 return join_url( $r );
  77  	 }
  78  
  79  	 // Make sure the base URL is absolute.
  80  	 $b = split_url( $baseUrl );
  81  	 if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
  82  	 	 return FALSE;
  83  	 $r['scheme'] = $b['scheme'];
  84  	 if (empty($b['path'])) {
  85  	 	 $b['path'] = '';
  86  	 }
  87  
  88  	 // If relative URL has an authority, clean path and return.
  89  	 if ( isset( $r['host'] ) )
  90  	 {
  91  	 	 if ( !empty( $r['path'] ) )
  92  	 	 	 $r['path'] = url_remove_dot_segments( $r['path'] );
  93  	 	 return join_url( $r );
  94  	 }
  95  	 unset( $r['port'] );
  96  	 unset( $r['user'] );
  97  	 unset( $r['pass'] );
  98  
  99  	 // Copy base authority.
 100  	 $r['host'] = $b['host'];
 101  	 if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
 102  	 if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
 103  	 if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];
 104  
 105  	 // If relative URL has no path, use base path
 106  	 if ( empty( $r['path'] ) )
 107  	 {
 108  	 	 if ( !empty( $b['path'] ) )
 109  	 	 	 $r['path'] = $b['path'];
 110  	 	 if ( !isset( $r['query'] ) && isset( $b['query'] ) )
 111  	 	 	 $r['query'] = $b['query'];
 112  	 	 return join_url( $r );
 113  	 }
 114  
 115  	 // If relative URL path doesn't start with /, merge with base path.
 116  	 if ($r['path'][0] != '/') {
 117  	 	 $base = core_text::strrchr($b['path'], '/', TRUE);
 118  	 	 if ($base === FALSE) {
 119  	 	 	 $base = '';
 120  	 	 }
 121  	 	 $r['path'] = $base . '/' . $r['path'];
 122  	 }
 123  	 $r['path'] = url_remove_dot_segments($r['path']);
 124  	 return join_url($r);
 125  }
 126  
 127  /**
 128   * Filter out "." and ".." segments from a URL's path and return
 129   * the result.
 130   *
 131   * This function implements the "remove_dot_segments" algorithm from
 132   * the RFC3986 specification for URLs.
 133   *
 134   * This function supports multi-byte characters with the UTF-8 encoding,
 135   * per the URL specification.
 136   *
 137   * Parameters:
 138   * 	 path	 the path to filter
 139   *
 140   * Return values:
 141   * 	 The filtered path with "." and ".." removed.
 142   */
 143  function url_remove_dot_segments( $path )
 144  {
 145  	 // multi-byte character explode
 146  	 $inSegs  = preg_split( '!/!u', $path );
 147  	 $outSegs = array( );
 148  	 foreach ( $inSegs as $seg )
 149  	 {
 150  	 	 if ( $seg == '' || $seg == '.')
 151  	 	 	 continue;
 152  	 	 if ( $seg == '..' )
 153  	 	 	 array_pop( $outSegs );
 154  	 	 else
 155  	 	 	 array_push( $outSegs, $seg );
 156  	 }
 157  	 $outPath = implode( '/', $outSegs );
 158  
 159  	 if ($path[0] == '/') {
 160  	 	 $outPath = '/' . $outPath;
 161  	 }
 162  
 163  	 // Compare last multi-byte character against '/'.
 164  	 if ($outPath != '/' && (core_text::strlen($path) - 1) == core_text::strrpos($path, '/', 'UTF-8')) {
 165  	 	 $outPath .= '/';
 166  	 }
 167  	 return $outPath;
 168  }
 169  
 170  /**
 171   * This function parses an absolute or relative URL and splits it
 172   * into individual components.
 173   *
 174   * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 175   * A portion of the ABNFs are repeated here:
 176   *
 177   *	 URI-reference	 = URI
 178   *	 	 	 / relative-ref
 179   *
 180   *	 URI	 	 = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 181   *
 182   *	 relative-ref	 = relative-part [ "?" query ] [ "#" fragment ]
 183   *
 184   *	 hier-part	 = "//" authority path-abempty
 185   *	 	 	 / path-absolute
 186   *	 	 	 / path-rootless
 187   *	 	 	 / path-empty
 188   *
 189   *	 relative-part	 = "//" authority path-abempty
 190   *	 	 	 / path-absolute
 191   *	 	 	 / path-noscheme
 192   *	 	 	 / path-empty
 193   *
 194   *	 authority	 = [ userinfo "@" ] host [ ":" port ]
 195   *
 196   * So, a URL has the following major components:
 197   *
 198   *	 scheme
 199   *	 	 The name of a method used to interpret the rest of
 200   *	 	 the URL.  Examples:  "http", "https", "mailto", "file'.
 201   *
 202   *	 authority
 203   *	 	 The name of the authority governing the URL's name
 204   *	 	 space.  Examples:  "example.com", "user@example.com",
 205   *	 	 "example.com:80", "user:password@example.com:80".
 206   *
 207   *	 	 The authority may include a host name, port number,
 208   *	 	 user name, and password.
 209   *
 210   *	 	 The host may be a name, an IPv4 numeric address, or
 211   *	 	 an IPv6 numeric address.
 212   *
 213   *	 path
 214   *	 	 The hierarchical path to the URL's resource.
 215   *	 	 Examples:  "/index.htm", "/scripts/page.php".
 216   *
 217   *	 query
 218   *	 	 The data for a query.  Examples:  "?search=google.com".
 219   *
 220   *	 fragment
 221   *	 	 The name of a secondary resource relative to that named
 222   *	 	 by the path.  Examples:  "#section1", "#header".
 223   *
 224   * An "absolute" URL must include a scheme and path.  The authority, query,
 225   * and fragment components are optional.
 226   *
 227   * A "relative" URL does not include a scheme and must include a path.  The
 228   * authority, query, and fragment components are optional.
 229   *
 230   * This function splits the $url argument into the following components
 231   * and returns them in an associative array.  Keys to that array include:
 232   *
 233   *	 "scheme"	 The scheme, such as "http".
 234   *	 "host"	 	 The host name, IPv4, or IPv6 address.
 235   *	 "port"	 	 The port number.
 236   *	 "user"	 	 The user name.
 237   *	 "pass"	 	 The user password.
 238   *	 "path"	 	 The path, such as a file path for "http".
 239   *	 "query"	 	 The query.
 240   *	 "fragment"	 The fragment.
 241   *
 242   * One or more of these may not be present, depending upon the URL.
 243   *
 244   * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 245   * "path", "query", and "fragment" may have percent-encoded characters
 246   * decoded.  The "scheme" and "port" cannot include percent-encoded
 247   * characters and are never decoded.  Decoding occurs after the URL has
 248   * been parsed.
 249   *
 250   * Parameters:
 251   * 	 url	 	 the URL to parse.
 252   *
 253   * 	 decode	 	 an optional boolean flag selecting whether
 254   * 	 	 	 to decode percent encoding or not.  Default = TRUE.
 255   *
 256   * Return values:
 257   * 	 the associative array of URL parts, or FALSE if the URL is
 258   * 	 too malformed to recognize any parts.
 259   */
 260  function split_url( $url, $decode=FALSE)
 261  {
 262  	 // Character sets from RFC3986.
 263  	 $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
 264  	 $xpchar        = $xunressub . ':@% ';
 265  
 266  	 // Scheme from RFC3986.
 267  	 $xscheme        = '([a-zA-Z][a-zA-Z\d+-.]*)';
 268  
 269  	 // User info (user + password) from RFC3986.
 270  	 $xuserinfo     = '((['  . $xunressub . '%]*)' .
 271  	                  '(:([' . $xunressub . ':%]*))?)';
 272  
 273  	 // IPv4 from RFC3986 (without digit constraints).
 274  	 $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';
 275  
 276  	 // IPv6 from RFC2732 (without digit and grouping constraints).
 277  	 $xipv6         = '(\[([a-fA-F\d.:]+)\])';
 278  
 279  	 // Host name from RFC1035.  Technically, must start with a letter.
 280  	 // Relax that restriction to better parse URL structure, then
 281  	 // leave host name validation to application.
 282  	 $xhost_name    = '([a-zA-Z\d\-.%]+)';
 283  
 284  	 // Authority from RFC3986.  Skip IP future.
 285  	 $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
 286  	 $xport         = '(\d*)';
 287  	 $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
 288  	 	          '?(:' . $xport . ')?)';
 289  
 290  	 // Path from RFC3986.  Blend absolute & relative for efficiency.
 291  	 $xslash_seg    = '(/[' . $xpchar . ']*)';
 292  	 $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
 293  	 $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
 294  	 $xpath_abs     = '(/(' . $xpath_rel . ')?)';
 295  	 $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
 296  	 	 	  '|' . $xpath_rel . ')';
 297  
 298  	 // Query and fragment from RFC3986.
 299  	 $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';
 300  
 301  	 // URL.
 302  	 $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
 303  	                  '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';
 304  
 305  
 306  	 // Split the URL into components.
 307  	 if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
 308  	 	 return FALSE;
 309  
 310  	 if ( !empty($m[2]) )	 	 $parts['scheme']  = strtolower($m[2]);
 311  
 312  	 if ( !empty($m[7]) ) {
 313  	 	 if ( isset( $m[9] ) )	 $parts['user']    = $m[9];
 314  	 	 else	 	 	 $parts['user']    = '';
 315  	 }
 316  	 if ( !empty($m[10]) )	 	 $parts['pass']    = $m[11];
 317  
 318  	 if ( !empty($m[13]) )	 	 $h=$parts['host'] = $m[13];
 319  	 else if ( !empty($m[14]) )	 $parts['host']    = $m[14];
 320  	 else if ( !empty($m[16]) )	 $parts['host']    = $m[16];
 321  	 else if ( !empty( $m[5] ) )	 $parts['host']    = '';
 322  	 if ( !empty($m[17]) )	 	 $parts['port']    = $m[18];
 323  
 324  	 if ( !empty($m[19]) )	 	 $parts['path']    = $m[19];
 325  	 else if ( !empty($m[21]) )	 $parts['path']    = $m[21];
 326  	 else if ( !empty($m[25]) )	 $parts['path']    = $m[25];
 327  
 328  	 if ( !empty($m[27]) )	 	 $parts['query']   = $m[28];
 329  	 if ( !empty($m[29]) )	 	 $parts['fragment']= $m[30];
 330  
 331  	 if ( !$decode )
 332  	 	 return $parts;
 333  	 if ( !empty($parts['user']) )
 334  	 	 $parts['user']     = rawurldecode( $parts['user'] );
 335  	 if ( !empty($parts['pass']) )
 336  	 	 $parts['pass']     = rawurldecode( $parts['pass'] );
 337  	 if ( !empty($parts['path']) )
 338  	 	 $parts['path']     = rawurldecode( $parts['path'] );
 339  	 if ( isset($h) )
 340  	 	 $parts['host']     = rawurldecode( $parts['host'] );
 341  	 if ( !empty($parts['query']) )
 342  	 	 $parts['query']    = rawurldecode( $parts['query'] );
 343  	 if ( !empty($parts['fragment']) )
 344  	 	 $parts['fragment'] = rawurldecode( $parts['fragment'] );
 345  	 return $parts;
 346  }
 347  
 348  /**
 349   * This function joins together URL components to form a complete URL.
 350   *
 351   * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 352   * This function implements the specification's "component recomposition"
 353   * algorithm for combining URI components into a full URI string.
 354   *
 355   * The $parts argument is an associative array containing zero or
 356   * more of the following:
 357   *
 358   *	 "scheme"	 The scheme, such as "http".
 359   *	 "host"	 	 The host name, IPv4, or IPv6 address.
 360   *	 "port"	 	 The port number.
 361   *	 "user"	 	 The user name.
 362   *	 "pass"	 	 The user password.
 363   *	 "path"	 	 The path, such as a file path for "http".
 364   *	 "query"	 	 The query.
 365   *	 "fragment"	 The fragment.
 366   *
 367   * The "port", "user", and "pass" values are only used when a "host"
 368   * is present.
 369   *
 370   * The optional $encode argument indicates if appropriate URL components
 371   * should be percent-encoded as they are assembled into the URL.  Encoding
 372   * is only applied to the "user", "pass", "host" (if a host name, not an
 373   * IP address), "path", "query", and "fragment" components.  The "scheme"
 374   * and "port" are never encoded.  When a "scheme" and "host" are both
 375   * present, the "path" is presumed to be hierarchical and encoding
 376   * processes each segment of the hierarchy separately (i.e., the slashes
 377   * are left alone).
 378   *
 379   * The assembled URL string is returned.
 380   *
 381   * Parameters:
 382   * 	 parts	 	 an associative array of strings containing the
 383   * 	 	 	 individual parts of a URL.
 384   *
 385   * 	 encode	 	 an optional boolean flag selecting whether
 386   * 	 	 	 to do percent encoding or not.  Default = true.
 387   *
 388   * Return values:
 389   * 	 Returns the assembled URL string.  The string is an absolute
 390   * 	 URL if a scheme is supplied, and a relative URL if not.  An
 391   * 	 empty string is returned if the $parts array does not contain
 392   * 	 any of the needed values.
 393   */
 394  function join_url( $parts, $encode=FALSE)
 395  {
 396  	 if ( $encode )
 397  	 {
 398  	 	 if ( isset( $parts['user'] ) )
 399  	 	 	 $parts['user']     = rawurlencode( $parts['user'] );
 400  	 	 if ( isset( $parts['pass'] ) )
 401  	 	 	 $parts['pass']     = rawurlencode( $parts['pass'] );
 402  	 	 if ( isset( $parts['host'] ) &&
 403  	 	 	 !preg_match( '!^(\[[\da-f.:]+\]])|([\da-f.:]+)$!ui', $parts['host'] ) )
 404  	 	 	 $parts['host']     = rawurlencode( $parts['host'] );
 405  	 	 if ( !empty( $parts['path'] ) )
 406  	 	 	 $parts['path']     = preg_replace( '!%2F!ui', '/',
 407  	 	 	 	 rawurlencode( $parts['path'] ) );
 408  	 	 if ( isset( $parts['query'] ) )
 409  	 	 	 $parts['query']    = rawurlencode( $parts['query'] );
 410  	 	 if ( isset( $parts['fragment'] ) )
 411  	 	 	 $parts['fragment'] = rawurlencode( $parts['fragment'] );
 412  	 }
 413  
 414  	 $url = '';
 415  	 if ( !empty( $parts['scheme'] ) )
 416  	 	 $url .= $parts['scheme'] . ':';
 417  	 if ( isset( $parts['host'] ) )
 418  	 {
 419  	 	 $url .= '//';
 420  	 	 if ( isset( $parts['user'] ) )
 421  	 	 {
 422  	 	 	 $url .= $parts['user'];
 423  	 	 	 if ( isset( $parts['pass'] ) )
 424  	 	 	 	 $url .= ':' . $parts['pass'];
 425  	 	 	 $url .= '@';
 426  	 	 }
 427  	 	 if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
 428  	 	 	 $url .= '[' . $parts['host'] . ']';	 // IPv6
 429  	 	 else
 430  	 	 	 $url .= $parts['host'];	 	 	 // IPv4 or name
 431  	 	 if ( isset( $parts['port'] ) )
 432  	 	 	 $url .= ':' . $parts['port'];
 433  	 	 if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
 434  	 	 	 $url .= '/';
 435  	 }
 436  	 if ( !empty( $parts['path'] ) )
 437  	 	 $url .= $parts['path'];
 438  	 if ( isset( $parts['query'] ) )
 439  	 	 $url .= '?' . $parts['query'];
 440  	 if ( isset( $parts['fragment'] ) )
 441  	 	 $url .= '#' . $parts['fragment'];
 442  	 return $url;
 443  }
 444  
 445  /**
 446   * This function encodes URL to form a URL which is properly
 447   * percent encoded to replace disallowed characters.
 448   *
 449   * RFC3986 specifies the allowed characters in the URL as well as
 450   * reserved characters in the URL. This function replaces all the
 451   * disallowed characters in the URL with their repective percent
 452   * encodings. Already encoded characters are not encoded again,
 453   * such as '%20' is not encoded to '%2520'.
 454   *
 455   * Parameters:
 456   * 	 url	 	 the url to encode.
 457   *
 458   * Return values:
 459   * 	 Returns the encoded URL string.
 460   */
 461  function encode_url($url) {
 462    $reserved = array(
 463      ":" => '!%3A!ui',
 464      "/" => '!%2F!ui',
 465      "?" => '!%3F!ui',
 466      "#" => '!%23!ui',
 467      "[" => '!%5B!ui',
 468      "]" => '!%5D!ui',
 469      "@" => '!%40!ui',
 470      "!" => '!%21!ui',
 471      "$" => '!%24!ui',
 472      "&" => '!%26!ui',
 473      "'" => '!%27!ui',
 474      "(" => '!%28!ui',
 475      ")" => '!%29!ui',
 476      "*" => '!%2A!ui',
 477      "+" => '!%2B!ui',
 478      "," => '!%2C!ui',
 479      ";" => '!%3B!ui',
 480      "=" => '!%3D!ui',
 481      "%" => '!%25!ui',
 482    );
 483  
 484    $url = rawurlencode($url);
 485    $url = preg_replace(array_values($reserved), array_keys($reserved), $url);
 486    return $url;
 487  }
 488  
 489  /**
 490   * Extract URLs from a web page.
 491   *
 492   * URLs are extracted from a long list of tags and attributes as defined
 493   * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
 494   * URLs are also extracted from tags and attributes that are common
 495   * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
 496   * and from WML 1.3 and 2.0.
 497   *
 498   * The function returns an associative array of associative arrays of
 499   * arrays of URLs.  The outermost array's keys are the tag (element) name,
 500   * such as "a" for <a> or "img" for <img>.  The values for these entries
 501   * are associative arrays where the keys are attribute names for those
 502   * tags, such as "href" for <a href="...">.  Finally, the values for
 503   * those arrays are URLs found in those tags and attributes throughout
 504   * the text.
 505   *
 506   * Parameters:
 507   * 	 text	 	 the UTF-8 text to scan
 508   *
 509   * Return values:
 510   * 	 an associative array where keys are tags and values are an
 511   * 	 associative array where keys are attributes and values are
 512   * 	 an array of URLs.
 513   *
 514   * See:
 515   * 	 http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
 516   */
 517  function extract_html_urls( $text )
 518  {
 519  	 $match_elements = array(
 520  	 	 // HTML
 521  	 	 array('element'=>'a',	 	 'attribute'=>'href'),	 	 // 2.0
 522  	 	 array('element'=>'a',	 	 'attribute'=>'urn'),	 	 // 2.0
 523  	 	 array('element'=>'base',	 'attribute'=>'href'),	 	 // 2.0
 524  	 	 array('element'=>'form',	 'attribute'=>'action'),	 	 // 2.0
 525  	 	 array('element'=>'img',	 	 'attribute'=>'src'),	 	 // 2.0
 526  	 	 array('element'=>'link',	 'attribute'=>'href'),	 	 // 2.0
 527  
 528  	 	 array('element'=>'applet',	 'attribute'=>'code'),	 	 // 3.2
 529  	 	 array('element'=>'applet',	 'attribute'=>'codebase'),	 // 3.2
 530  	 	 array('element'=>'area',	 'attribute'=>'href'),	 	 // 3.2
 531  	 	 array('element'=>'body',	 'attribute'=>'background'),	 // 3.2
 532  	 	 array('element'=>'img',	 	 'attribute'=>'usemap'),	 	 // 3.2
 533  	 	 array('element'=>'input',	 'attribute'=>'src'),	 	 // 3.2
 534  
 535  	 	 array('element'=>'applet',	 'attribute'=>'archive'),	 // 4.01
 536  	 	 array('element'=>'applet',	 'attribute'=>'object'),	 	 // 4.01
 537  	 	 array('element'=>'blockquote',	 'attribute'=>'cite'),	 	 // 4.01
 538  	 	 array('element'=>'del',	 	 'attribute'=>'cite'),	 	 // 4.01
 539  	 	 array('element'=>'frame',	 'attribute'=>'longdesc'),	 // 4.01
 540  	 	 array('element'=>'frame',	 'attribute'=>'src'),	 	 // 4.01
 541  	 	 array('element'=>'head',	 'attribute'=>'profile'),	 // 4.01
 542  	 	 array('element'=>'iframe',	 'attribute'=>'longdesc'),	 // 4.01
 543  	 	 array('element'=>'iframe',	 'attribute'=>'src'),	 	 // 4.01
 544  	 	 array('element'=>'img',	 	 'attribute'=>'longdesc'),	 // 4.01
 545  	 	 array('element'=>'input',	 'attribute'=>'usemap'),	 	 // 4.01
 546  	 	 array('element'=>'ins',	 	 'attribute'=>'cite'),	 	 // 4.01
 547  	 	 array('element'=>'object',	 'attribute'=>'archive'),	 // 4.01
 548  	 	 array('element'=>'object',	 'attribute'=>'classid'),	 // 4.01
 549  	 	 array('element'=>'object',	 'attribute'=>'codebase'),	 // 4.01
 550  	 	 array('element'=>'object',	 'attribute'=>'data'),	 	 // 4.01
 551  	 	 array('element'=>'object',	 'attribute'=>'usemap'),	 	 // 4.01
 552  	 	 array('element'=>'q',	 	 'attribute'=>'cite'),	 	 // 4.01
 553  	 	 array('element'=>'script',	 'attribute'=>'src'),	 	 // 4.01
 554  
 555  	 	 array('element'=>'audio',	 'attribute'=>'src'),	 	 // 5.0
 556  	 	 array('element'=>'command',	 'attribute'=>'icon'),	 	 // 5.0
 557  	 	 array('element'=>'embed',	 'attribute'=>'src'),	 	 // 5.0
 558  	 	 array('element'=>'event-source','attribute'=>'src'),	 	 // 5.0
 559  	 	 array('element'=>'html',	 'attribute'=>'manifest'),	 // 5.0
 560  	 	 array('element'=>'source',	 'attribute'=>'src'),	 	 // 5.0
 561  	 	 array('element'=>'video',	 'attribute'=>'src'),	 	 // 5.0
 562  	 	 array('element'=>'video',	 'attribute'=>'poster'),	 	 // 5.0
 563  
 564  	 	 array('element'=>'bgsound',	 'attribute'=>'src'),	 	 // Extension
 565  	 	 array('element'=>'body',	 'attribute'=>'credits'),	 // Extension
 566  	 	 array('element'=>'body',	 'attribute'=>'instructions'),	 // Extension
 567  	 	 array('element'=>'body',	 'attribute'=>'logo'),	 	 // Extension
 568  	 	 array('element'=>'div',	 	 'attribute'=>'href'),	 	 // Extension
 569  	 	 array('element'=>'div',	 	 'attribute'=>'src'),	 	 // Extension
 570  	 	 array('element'=>'embed',	 'attribute'=>'code'),	 	 // Extension
 571  	 	 array('element'=>'embed',	 'attribute'=>'pluginspage'),	 // Extension
 572  	 	 array('element'=>'html',	 'attribute'=>'background'),	 // Extension
 573  	 	 array('element'=>'ilayer',	 'attribute'=>'src'),	 	 // Extension
 574  	 	 array('element'=>'img',	 	 'attribute'=>'dynsrc'),	 	 // Extension
 575  	 	 array('element'=>'img',	 	 'attribute'=>'lowsrc'),	 	 // Extension
 576  	 	 array('element'=>'input',	 'attribute'=>'dynsrc'),	 	 // Extension
 577  	 	 array('element'=>'input',	 'attribute'=>'lowsrc'),	 	 // Extension
 578  	 	 array('element'=>'table',	 'attribute'=>'background'),	 // Extension
 579  	 	 array('element'=>'td',	 	 'attribute'=>'background'),	 // Extension
 580  	 	 array('element'=>'th',	 	 'attribute'=>'background'),	 // Extension
 581  	 	 array('element'=>'layer',	 'attribute'=>'src'),	 	 // Extension
 582  	 	 array('element'=>'xml',	 	 'attribute'=>'src'),	 	 // Extension
 583  
 584  	 	 array('element'=>'button',	 'attribute'=>'action'),	 	 // Forms 2.0
 585  	 	 array('element'=>'datalist',	 'attribute'=>'data'),	 	 // Forms 2.0
 586  	 	 array('element'=>'form',	 'attribute'=>'data'),	 	 // Forms 2.0
 587  	 	 array('element'=>'input',	 'attribute'=>'action'),	 	 // Forms 2.0
 588  	 	 array('element'=>'select',	 'attribute'=>'data'),	 	 // Forms 2.0
 589  
 590  	 	 // XHTML
 591  	 	 array('element'=>'html',	 'attribute'=>'xmlns'),
 592  
 593  	 	 // WML
 594  	 	 array('element'=>'access',	 'attribute'=>'path'),	 	 // 1.3
 595  	 	 array('element'=>'card',	 'attribute'=>'onenterforward'),	 // 1.3
 596  	 	 array('element'=>'card',	 'attribute'=>'onenterbackward'),// 1.3
 597  	 	 array('element'=>'card',	 'attribute'=>'ontimer'),	 // 1.3
 598  	 	 array('element'=>'go',	 	 'attribute'=>'href'),	 	 // 1.3
 599  	 	 array('element'=>'option',	 'attribute'=>'onpick'),	 	 // 1.3
 600  	 	 array('element'=>'template',	 'attribute'=>'onenterforward'),	 // 1.3
 601  	 	 array('element'=>'template',	 'attribute'=>'onenterbackward'),// 1.3
 602  	 	 array('element'=>'template',	 'attribute'=>'ontimer'),	 // 1.3
 603  	 	 array('element'=>'wml',	 	 'attribute'=>'xmlns'),	 	 // 2.0
 604  	 );
 605  
 606  	 $match_metas = array(
 607  	 	 'content-base',
 608  	 	 'content-location',
 609  	 	 'referer',
 610  	 	 'location',
 611  	 	 'refresh',
 612  	 );
 613  
 614  	 // Extract all elements
 615  	 if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
 616  	 	 return array( );
 617  	 $elements = $matches[1];
 618  	 $value_pattern = '=(("([^"]*)")|([^\s]*))';
 619  
 620  	 // Match elements and attributes
 621  	 foreach ( $match_elements as $match_element )
 622  	 {
 623  	 	 $name = $match_element['element'];
 624  	 	 $attr = $match_element['attribute'];
 625  	 	 $pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/iu';
 626  	 	 if ( $name == 'object' )
 627  	 	 	 $split_pattern = '/\s*/u';	 // Space-separated URL list
 628  	 	 else if ( $name == 'archive' )
 629  	 	 	 $split_pattern = '/,\s*/u';	 // Comma-separated URL list
 630  	 	 else
 631  	 	 	 unset( $split_pattern );	 // Single URL
 632  	 	 foreach ( $elements as $element )
 633  	 	 {
 634  	 	 	 if ( !preg_match( $pattern, $element, $match ) )
 635  	 	 	 	 continue;
 636  	 	 	 $m = empty($match[3]) ? (!empty($match[4])?$match[4]:'') : $match[3];
 637  	 	 	 if ( !isset( $split_pattern ) )
 638  	 	 	 	 $urls[$name][$attr][] = $m;
 639  	 	 	 else
 640  	 	 	 {
 641  	 	 	 	 $msplit = preg_split( $split_pattern, $m );
 642  	 	 	 	 foreach ( $msplit as $ms )
 643  	 	 	 	 	 $urls[$name][$attr][] = $ms;
 644  	 	 	 }
 645  	 	 }
 646  	 }
 647  
 648  	 // Match meta http-equiv elements
 649  	 foreach ( $match_metas as $match_meta )
 650  	 {
 651  	 	 $attr_pattern    = '/http-equiv="?' . $match_meta . '"?/iu';
 652  	 	 $content_pattern = '/content'  . $value_pattern . '/iu';
 653  	 	 $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
 654  	 	 foreach ( $elements as $element )
 655  	 	 {
 656  	 	 	 if ( !preg_match( '/^meta/iu', $element ) ||
 657  	 	 	 	 !preg_match( $attr_pattern, $element ) ||
 658  	 	 	 	 !preg_match( $content_pattern, $element, $match ) )
 659  	 	 	 	 continue;
 660  	 	 	 $m = empty($match[3]) ? $match[4] : $match[3];
 661  	 	 	 if ( $match_meta != 'refresh' )
 662  	 	 	 	 $urls['meta']['http-equiv'][] = $m;
 663  	 	 	 else if ( preg_match( $refresh_pattern, $m, $match ) )
 664  	 	 	 	 $urls['meta']['http-equiv'][] = $match[2];
 665  	 	 }
 666  	 }
 667  
 668  	 // Match style attributes
 669  	 $urls['style'] = array( );
 670  	 $style_pattern = '/style' . $value_pattern . '/iu';
 671  	 foreach ( $elements as $element )
 672  	 {
 673  	 	 if ( !preg_match( $style_pattern, $element, $match ) )
 674  	 	 	 continue;
 675  	 	 $m = empty($match[3]) ? $match[4] : $match[3];
 676  	 	 $style_urls = extract_css_urls( $m );
 677  	 	 if ( !empty( $style_urls ) )
 678  	 	 	 $urls['style'] = array_merge_recursive(
 679  	 	 	 	 $urls['style'], $style_urls );
 680  	 }
 681  
 682  	 // Match style bodies
 683  	 if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
 684  	 {
 685  	 	 foreach ( $style_bodies[1] as $style_body )
 686  	 	 {
 687  	 	 	 $style_urls = extract_css_urls( $style_body );
 688  	 	 	 if ( !empty( $style_urls ) )
 689  	 	 	 	 $urls['style'] = array_merge_recursive(
 690  	 	 	 	 	 $urls['style'], $style_urls );
 691  	 	 }
 692  	 }
 693  	 if ( empty($urls['style']) )
 694  	 	 unset( $urls['style'] );
 695  
 696  	 return $urls;
 697  }
 698  /**
 699   * Extract URLs from UTF-8 CSS text.
 700   *
 701   * URLs within @import statements and url() property functions are extracted
 702   * and returned in an associative array of arrays.  Array keys indicate
 703   * the use context for the URL, including:
 704   *
 705   * 	 "import"
 706   * 	 "property"
 707   *
 708   * Each value in the associative array is an array of URLs.
 709   *
 710   * Parameters:
 711   * 	 text	 	 the UTF-8 text to scan
 712   *
 713   * Return values:
 714   * 	 an associative array of arrays of URLs.
 715   *
 716   * See:
 717   * 	 http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
 718   */
 719  function extract_css_urls( $text )
 720  {
 721  	 $urls = array( );
 722  
 723  	 $url_pattern     = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
 724  	 $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
 725  	 $pattern         = '/(' .
 726  	 	  '(@import\s*[\'"]' . $url_pattern     . '[\'"])' .
 727  	 	 '|(@import\s*'      . $urlfunc_pattern . ')'      .
 728  	 	 '|('                . $urlfunc_pattern . ')'      .  ')/iu';
 729  	 if ( !preg_match_all( $pattern, $text, $matches ) )
 730  	 	 return $urls;
 731  
 732  	 // @import '...'
 733  	 // @import "..."
 734  	 foreach ( $matches[3] as $match )
 735  	 	 if ( !empty($match) )
 736  	 	 	 $urls['import'][] =
 737  	 	 	 	 preg_replace( '/\\\\(.)/u', '\\1', $match );
 738  
 739  	 // @import url(...)
 740  	 // @import url('...')
 741  	 // @import url("...")
 742  	 foreach ( $matches[7] as $match )
 743  	 	 if ( !empty($match) )
 744  	 	 	 $urls['import'][] =
 745  	 	 	 	 preg_replace( '/\\\\(.)/u', '\\1', $match );
 746  
 747  	 // url(...)
 748  	 // url('...')
 749  	 // url("...")
 750  	 foreach ( $matches[11] as $match )
 751  	 	 if ( !empty($match) )
 752  	 	 	 $urls['property'][] =
 753  	 	 	 	 preg_replace( '/\\\\(.)/u', '\\1', $match );
 754  
 755  	 return $urls;
 756  }