Moodle 4.1 XRef and Diffs

Search moodle.org's
Developer Documentation
See Release Notes
Long Term Support Release
Bug fixes for general core bugs in 4.1.x will end 13 November 2023 (12 months).
Bug fixes for security issues in 4.1.x will end 10 November 2025 (36 months).
PHP version: minimum PHP 7.4.0 Note: minimum PHP version has increased since Moodle 4.0. PHP 8.0.x is supported too.
Moodle 4.1 Database Schema (by Marcus Green)
/repository/url/ -> locallib.php (source)
   1  <?php
   2  
   3  /**
   4   * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
   5   * All rights reserved.
   6   *
   7   * Redistribution and use in source and binary forms, with or without
   8   * modification, are permitted provided that the following conditions
   9   * are met:
  10   *
  11   *	 * Redistributions of source code must retain the above copyright
  12   *	   notice, this list of conditions and the following disclaimer.
  13   *
  14   *	 * Redistributions in binary form must reproduce the above
  15   *	   copyright notice, this list of conditions and the following
  16   *	   disclaimer in the documentation and/or other materials provided
  17   *	   with the distribution.
  18   *
  19   *	 * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
  20   *	   the names of its contributors may be used to endorse or promote
  21   *	   products derived from this software without specific prior
  22   *	   written permission.
  23   *
  24   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  27   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  28   * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  32   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34   * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
  35   * OF SUCH DAMAGE.
  36   */
  37  
  38  /*
  39   * This is a BSD License approved by the Open Source Initiative (OSI).
  40   * See:  http://www.opensource.org/licenses/bsd-license.php
  41   */
  42  
  43  defined('MOODLE_INTERNAL') || die();
  44  
  /**
   * Combine a base URL and a relative URL to produce a new absolute URL.
   *
   * The base URL is typically the URL of a page and the relative URL is a
   * reference embedded in that page.  The steps follow the reference
   * resolution ("absolutize") algorithm of RFC 3986:
   *
   *   1. a reference with its own scheme is returned as is (dot segments
   *      are removed from an absolute path);
   *   2. a reference with its own authority only inherits the base scheme;
   *   3. a reference without a path inherits the base path (and the base
   *      query, unless it has its own);
   *   4. otherwise the reference path is merged with the base path and dot
   *      segments are removed.
   *
   * @param string $baseUrl     the absolute base URL; it must contain a
   *                            scheme and a host.
   * @param string $relativeUrl the relative URL to convert.
   * @return string|false the absolute URL, or FALSE if the base URL is not
   *                      absolute or if either URL cannot be parsed.
   */
  function url_to_absolute( $baseUrl, $relativeUrl )
  {
      $r = split_url($relativeUrl);
      if ($r === false) {
          return false;
      }
      if (!is_array($r)) {
          // Defensive: treat "no components at all" as an empty component set.
          $r = array();
      }

      // If the relative URL has a scheme it is already absolute: clean path and return.
      if (!empty($r['scheme'])) {
          if (isset($r['path']) && $r['path'] !== '' && $r['path'][0] === '/') {
              $r['path'] = url_remove_dot_segments($r['path']);
          }
          return join_url($r);
      }

      // Make sure the base URL is absolute.
      $b = split_url($baseUrl);
      if ($b === false || empty($b['scheme']) || empty($b['host'])) {
          return false;
      }
      $r['scheme'] = $b['scheme'];
      $basePath = (isset($b['path']) && $b['path'] !== '') ? $b['path'] : '';

      // If the relative URL has an authority, clean path and return.
      if (isset($r['host'])) {
          if (isset($r['path']) && $r['path'] !== '') {
              $r['path'] = url_remove_dot_segments($r['path']);
          }
          return join_url($r);
      }

      // No authority of its own: take the complete authority from the base.
      unset($r['port'], $r['user'], $r['pass']);
      $r['host'] = $b['host'];
      foreach (array('port', 'user', 'pass') as $key) {
          if (isset($b[$key])) {
              $r[$key] = $b[$key];
          }
      }

      // If the relative URL has no path, use the base path.  The check is
      // an explicit empty-string test because empty() would also treat the
      // perfectly valid relative path "0" as missing.
      if (!isset($r['path']) || $r['path'] === '') {
          if ($basePath !== '') {
              $r['path'] = $basePath;
          }
          if (!isset($r['query']) && isset($b['query'])) {
              $r['query'] = $b['query'];
          }
          return join_url($r);
      }

      // If the relative path doesn't start with "/", merge it with the
      // base path: everything up to (excluding) the last "/" of the base.
      if ($r['path'][0] !== '/') {
          $base = core_text::strrchr($basePath, '/', true);
          if ($base === false) {
              $base = '';
          }
          $r['path'] = $base . '/' . $r['path'];
      }
      $r['path'] = url_remove_dot_segments($r['path']);
      return join_url($r);
  }
 126  
 /**
  * Filter out "." and ".." segments from a URL's path and return the result.
  *
  * This function follows the "remove_dot_segments" algorithm from the
  * RFC3986 specification for URLs (section 5.2.4), with one deliberate
  * difference kept from the original implementation: empty segments
  * (repeated slashes) are collapsed.
  *
  * A path that ends in "/", "/." or "/.." yields a result that ends in "/",
  * as the RFC requires (e.g. "/a/b/.." becomes "/a/", not "/a").
  *
  * The path is processed byte-wise.  That is safe for UTF-8 input because
  * the "/" byte never occurs inside a multi-byte sequence.
  *
  * @param string $path the path to filter.
  * @return string the filtered path with "." and ".." segments removed;
  *                an empty string when the path is empty.
  */
 function url_remove_dot_segments( $path )
 {
     $path = (string) $path;
     if ($path === '') {
         // Nothing to filter; also avoids reading offset 0 of an empty string.
         return '';
     }

     $inSegs  = explode('/', $path);
     $outSegs = array();
     foreach ($inSegs as $seg) {
         if ($seg === '' || $seg === '.') {
             continue;
         }
         if ($seg === '..') {
             array_pop($outSegs);
         } else {
             $outSegs[] = $seg;
         }
     }
     $outPath = implode('/', $outSegs);

     if ($path[0] === '/') {
         $outPath = '/' . $outPath;
     }

     // Decide whether the result denotes a "directory" and needs a trailing
     // slash: either the input ended with "/" (last segment is empty) or it
     // ended with a dot segment that followed a slash.
     $lastSeg = $inSegs[count($inSegs) - 1];
     $endsWithSlash = ($lastSeg === '');
     $endsWithDotSegment = count($inSegs) > 1 && ($lastSeg === '.' || $lastSeg === '..');
     if ($outPath !== '/' && ($endsWithSlash || $endsWithDotSegment)) {
         $outPath .= '/';
     }
     return $outPath;
 }
 169  
 /**
  * This function parses an absolute or relative URL and splits it
  * into individual components.
  *
  * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
  * A portion of the ABNFs are repeated here:
  *
  *     URI-reference = URI / relative-ref
  *
  *     URI           = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  *
  *     relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
  *
  *     hier-part     = "//" authority path-abempty
  *                   / path-absolute
  *                   / path-rootless
  *                   / path-empty
  *
  *     relative-part = "//" authority path-abempty
  *                   / path-absolute
  *                   / path-noscheme
  *                   / path-empty
  *
  *     authority     = [ userinfo "@" ] host [ ":" port ]
  *
  * So, a URL has the following major components:
  *
  *     scheme     The name of a method used to interpret the rest of the
  *                URL.  Examples: "http", "https", "mailto", "file".
  *     authority  The name of the authority governing the URL's name space.
  *                Examples: "example.com", "user@example.com",
  *                "example.com:80", "user:password@example.com:80".
  *                It may include a host (a name, an IPv4 or an IPv6
  *                address), a port number, a user name and a password.
  *     path       The hierarchical path to the URL's resource.
  *                Examples: "/index.htm", "/scripts/page.php".
  *     query      The data for a query.  Example: "?search=google.com".
  *     fragment   The name of a secondary resource relative to that named
  *                by the path.  Examples: "#section1", "#header".
  *
  * The returned associative array may contain the following keys; any of
  * them is absent when the URL does not contain that component:
  *
  *     "scheme"    The scheme, lower-cased, such as "http".
  *     "host"      The host name, IPv4, or IPv6 address (without brackets).
  *                 An empty string when "//" is present without a host.
  *     "port"      The port number.
  *     "user"      The user name.
  *     "pass"      The user password.
  *     "path"      The path, such as a file path for "http".
  *     "query"     The query (without the leading "?").
  *     "fragment"  The fragment (without the leading "#").
  *
  * Optionally, the "user", "pass", "host" (if a name, not an IP literal),
  * "path", "query", and "fragment" may have percent-encoded characters
  * decoded.  The "scheme" and "port" cannot include percent-encoded
  * characters and are never decoded.  Decoding occurs after the URL has
  * been parsed.
  *
  * @param string $url    the URL to parse.
  * @param bool   $decode whether to decode percent encoding in the parts.
  *                       Default = FALSE.
  * @return array|false the associative array of URL parts (possibly empty,
  *                     e.g. for an empty URL), or FALSE if the URL is too
  *                     malformed to recognize any parts.
  */
 function split_url( $url, $decode=FALSE)
 {
     // Character sets from RFC3986.
     $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
     $xpchar        = $xunressub . ':@% ';

     // Scheme from RFC3986.  The hyphen is escaped: an unescaped "+-."
     // would be a character range that also admits ",".
     $xscheme       = '([a-zA-Z][a-zA-Z\d+\-.]*)';

     // User info (user + password) from RFC3986.
     $xuserinfo     = '((['  . $xunressub . '%]*)' .
                      '(:([' . $xunressub . ':%]*))?)';

     // IPv4 from RFC3986 (without digit constraints).
     $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

     // IPv6 from RFC2732 (without digit and grouping constraints).
     $xipv6         = '(\[([a-fA-F\d.:]+)\])';

     // Host name from RFC1035.  Technically, must start with a letter.
     // Relax that restriction to better parse URL structure, then
     // leave host name validation to application.
     $xhost_name    = '([a-zA-Z\d\-.%]+)';

     // Authority from RFC3986.  Skip IP future.
     $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
     $xport         = '(\d*)';
     $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                      '?(:' . $xport . ')?)';

     // Path from RFC3986.  Blend absolute & relative for efficiency.
     $xslash_seg    = '(/[' . $xpchar . ']*)';
     $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
     $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
     $xpath_abs     = '(/(' . $xpath_rel . ')?)';
     $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
                      '|' . $xpath_rel . ')';

     // Query and fragment from RFC3986.
     $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

     // URL.
     $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
                      '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';

     // Split the URL into components.
     if (!preg_match('!' . $xurl . '!', $url, $m)) {
         return false;
     }

     // preg_match() omits trailing groups that did not take part in the
     // match.  Pad the result so that every group index (0..30) exists and
     // "did not match" is uniformly the empty string.  Comparing against
     // '' instead of using empty() keeps components equal to "0".
     $m = array_pad($m, 31, '');

     $parts = array();

     if ($m[2] !== '') {
         $parts['scheme'] = strtolower($m[2]);
     }

     // Group 7 is "userinfo@"; groups 9 and 11 are the user and password.
     if ($m[7] !== '') {
         $parts['user'] = $m[9];
     }
     if ($m[10] !== '') {
         $parts['pass'] = $m[11];
     }

     // Groups 13/14/16 are host name, IPv4 and (unbracketed) IPv6; group 5
     // is the whole "//authority", which may be present with no host.
     $hostIsName = false;
     if ($m[13] !== '') {
         $parts['host'] = $m[13];
         $hostIsName = true;
     } else if ($m[14] !== '') {
         $parts['host'] = $m[14];
     } else if ($m[16] !== '') {
         $parts['host'] = $m[16];
     } else if ($m[5] !== '') {
         $parts['host'] = '';
     }
     if ($m[17] !== '') {
         $parts['port'] = $m[18];
     }

     // Exactly one of the three path alternatives can have matched.
     if ($m[19] !== '') {
         $parts['path'] = $m[19];
     } else if ($m[21] !== '') {
         $parts['path'] = $m[21];
     } else if ($m[25] !== '') {
         $parts['path'] = $m[25];
     }

     if ($m[27] !== '') {
         $parts['query'] = $m[28];
     }
     if ($m[29] !== '') {
         $parts['fragment'] = $m[30];
     }

     if (!$decode) {
         return $parts;
     }

     // Percent-decode every component except scheme, port and IP hosts.
     foreach (array('user', 'pass', 'path', 'query', 'fragment') as $key) {
         if (isset($parts[$key])) {
             $parts[$key] = rawurldecode($parts[$key]);
         }
     }
     if ($hostIsName) {
         $parts['host'] = rawurldecode($parts['host']);
     }
     return $parts;
 }
 347  
 /**
  * This function joins together URL components to form a complete URL.
  *
  * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
  * This function implements the specification's "component recomposition"
  * algorithm for combining URI components into a full URI string.
  *
  * The $parts argument is an associative array containing zero or
  * more of the following:
  *
  *     "scheme"    The scheme, such as "http".
  *     "host"      The host name, IPv4, or IPv6 address.
  *     "port"      The port number.
  *     "user"      The user name.
  *     "pass"      The user password.
  *     "path"      The path, such as a file path for "http".
  *     "query"     The query.
  *     "fragment"  The fragment.
  *
  * The "port", "user", and "pass" values are only used when a "host"
  * is present, and "pass" only when "user" is present.  An unbracketed
  * IPv6 host is wrapped in "[...]".  When a host is present and the path
  * does not start with "/", a "/" is inserted between them.
  *
  * The optional $encode argument indicates if URL components should be
  * percent-encoded as they are assembled into the URL.  Encoding is only
  * applied to the "user", "pass", "host" (if a host name, not an IP
  * address), "path", "query", and "fragment" components.  The "scheme"
  * and "port" are never encoded.  Slashes in the path are left alone so
  * that each segment of the hierarchy is encoded separately.
  *
  * @param array $parts  an associative array of strings containing the
  *                      individual parts of a URL.
  * @param bool  $encode whether to do percent encoding or not.
  *                      Default = FALSE.
  * @return string the assembled URL string.  The string is an absolute
  *                URL if a scheme is supplied, and a relative URL if not.
  *                An empty string is returned if the $parts array does not
  *                contain any of the needed values.
  */
 function join_url( $parts, $encode=FALSE)
 {
     // Explicit empty-string test: empty() would also discard the path "0".
     $hasPath = isset($parts['path']) && $parts['path'] !== '';

     if ($encode) {
         if (isset($parts['user'])) {
             $parts['user'] = rawurlencode($parts['user']);
         }
         if (isset($parts['pass'])) {
             $parts['pass'] = rawurlencode($parts['pass']);
         }
         // Encode host names only.  The whole host must look like an IP
         // literal (optionally bracketed) to be left untouched; both
         // alternatives are anchored at both ends.
         if (isset($parts['host']) &&
                 !preg_match('!^(\[[\da-f.:]+\]|[\da-f.:]+)$!ui', $parts['host'])) {
             $parts['host'] = rawurlencode($parts['host']);
         }
         if ($hasPath) {
             // Encode everything, then restore the segment separators.
             $parts['path'] = preg_replace('!%2F!ui', '/',
                     rawurlencode($parts['path']));
         }
         if (isset($parts['query'])) {
             $parts['query'] = rawurlencode($parts['query']);
         }
         if (isset($parts['fragment'])) {
             $parts['fragment'] = rawurlencode($parts['fragment']);
         }
     }

     $url = '';
     if (!empty($parts['scheme'])) {
         $url .= $parts['scheme'] . ':';
     }
     if (isset($parts['host'])) {
         $url .= '//';
         if (isset($parts['user'])) {
             $url .= $parts['user'];
             if (isset($parts['pass'])) {
                 $url .= ':' . $parts['pass'];
             }
             $url .= '@';
         }
         if (preg_match('!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'])) {
             $url .= '[' . $parts['host'] . ']';     // IPv6
         } else {
             $url .= $parts['host'];                 // IPv4 or name
         }
         if (isset($parts['port'])) {
             $url .= ':' . $parts['port'];
         }
         // An authority must be separated from a rootless path.
         if ($hasPath && $parts['path'][0] !== '/') {
             $url .= '/';
         }
     }
     if ($hasPath) {
         $url .= $parts['path'];
     }
     if (isset($parts['query'])) {
         $url .= '?' . $parts['query'];
     }
     if (isset($parts['fragment'])) {
         $url .= '#' . $parts['fragment'];
     }
     return $url;
 }
 444  
 /**
  * Percent-encode a URL while keeping its structure intact.
  *
  * The whole string is first percent-encoded, then the encodings of the
  * characters that RFC3986 reserves (gen-delims and sub-delims) and of the
  * percent sign itself are turned back into the literal characters.  As a
  * result only disallowed characters end up encoded, and sequences that
  * were already encoded stay as they were ('%20' does not become '%2520').
  *
  * Parameters:
  *   url    the url to encode.
  *
  * Return values:
  *   Returns the encoded URL string.
  */
 function encode_url($url) {
     // Pattern for the encoded form => literal character to restore.
     // The percent sign comes last so that "%25xx" sequences produced by
     // encoding an existing escape are only unwrapped after every other
     // substitution has been made.
     $restore = array(
         '!%3A!ui' => ":",
         '!%2F!ui' => "/",
         '!%3F!ui' => "?",
         '!%23!ui' => "#",
         '!%5B!ui' => "[",
         '!%5D!ui' => "]",
         '!%40!ui' => "@",
         '!%21!ui' => "!",
         '!%24!ui' => "$",
         '!%26!ui' => "&",
         '!%27!ui' => "'",
         '!%28!ui' => "(",
         '!%29!ui' => ")",
         '!%2A!ui' => "*",
         '!%2B!ui' => "+",
         '!%2C!ui' => ",",
         '!%3B!ui' => ";",
         '!%3D!ui' => "=",
         '!%25!ui' => "%",
     );

     $encoded = rawurlencode($url);
     foreach ($restore as $pattern => $literal) {
         $encoded = preg_replace($pattern, $literal, $encoded);
     }
     return $encoded;
 }
 488  
 /**
  * Extract URLs from a web page.
  *
  * URLs are extracted from a long list of tags and attributes as defined
  * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
  * URLs are also extracted from tags and attributes that are common
  * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
  * and from WML 1.3 and 2.0.  In addition, URLs are taken from
  * <meta http-equiv="..."> elements, from style attributes and from the
  * bodies of <style> elements (the latter two via extract_css_urls()).
  *
  * The function returns an associative array of associative arrays of
  * arrays of URLs.  The outermost array's keys are the tag (element) name,
  * such as "a" for <a> or "img" for <img>.  The values for these entries
  * are associative arrays where the keys are attribute names for those
  * tags, such as "href" for <a href="...">.  Finally, the values for
  * those arrays are URLs found in those tags and attributes throughout
  * the text.  Meta URLs are stored under ['meta']['http-equiv'] and CSS
  * URLs under ['style'].
  *
  * Attribute values are recognised when double-quoted or unquoted.  The
  * "archive" attribute holds a list of URLs and is split into its items:
  * space-separated for <object>, comma-separated for <applet>.
  *
  * @param string $text the UTF-8 text to scan.
  * @return array an associative array where keys are tags and values are
  *               an associative array where keys are attributes and values
  *               are an array of URLs; empty when nothing was found.
  *
  * See:
  *   http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
  */
 function extract_html_urls( $text )
 {
     // Element/attribute pairs that carry URLs, in lookup order.
     $match_elements = array(
         // HTML 2.0
         array('a', 'href'),
         array('a', 'urn'),
         array('base', 'href'),
         array('form', 'action'),
         array('img', 'src'),
         array('link', 'href'),

         // HTML 3.2
         array('applet', 'code'),
         array('applet', 'codebase'),
         array('area', 'href'),
         array('body', 'background'),
         array('img', 'usemap'),
         array('input', 'src'),

         // HTML 4.01
         array('applet', 'archive'),
         array('applet', 'object'),
         array('blockquote', 'cite'),
         array('del', 'cite'),
         array('frame', 'longdesc'),
         array('frame', 'src'),
         array('head', 'profile'),
         array('iframe', 'longdesc'),
         array('iframe', 'src'),
         array('img', 'longdesc'),
         array('input', 'usemap'),
         array('ins', 'cite'),
         array('object', 'archive'),
         array('object', 'classid'),
         array('object', 'codebase'),
         array('object', 'data'),
         array('object', 'usemap'),
         array('q', 'cite'),
         array('script', 'src'),

         // HTML 5.0
         array('audio', 'src'),
         array('command', 'icon'),
         array('embed', 'src'),
         array('event-source', 'src'),
         array('html', 'manifest'),
         array('source', 'src'),
         array('video', 'src'),
         array('video', 'poster'),

         // Extensions
         array('bgsound', 'src'),
         array('body', 'credits'),
         array('body', 'instructions'),
         array('body', 'logo'),
         array('div', 'href'),
         array('div', 'src'),
         array('embed', 'code'),
         array('embed', 'pluginspage'),
         array('html', 'background'),
         array('ilayer', 'src'),
         array('img', 'dynsrc'),
         array('img', 'lowsrc'),
         array('input', 'dynsrc'),
         array('input', 'lowsrc'),
         array('table', 'background'),
         array('td', 'background'),
         array('th', 'background'),
         array('layer', 'src'),
         array('xml', 'src'),

         // Forms 2.0
         array('button', 'action'),
         array('datalist', 'data'),
         array('form', 'data'),
         array('input', 'action'),
         array('select', 'data'),

         // XHTML
         array('html', 'xmlns'),

         // WML 1.3
         array('access', 'path'),
         array('card', 'onenterforward'),
         array('card', 'onenterbackward'),
         array('card', 'ontimer'),
         array('go', 'href'),
         array('option', 'onpick'),
         array('template', 'onenterforward'),
         array('template', 'onenterbackward'),
         array('template', 'ontimer'),

         // WML 2.0
         array('wml', 'xmlns'),
     );

     $match_metas = array(
         'content-base',
         'content-location',
         'referer',
         'location',
         'refresh',
     );

     $urls = array();

     // Extract all elements (the text between "<" and ">" of opening tags).
     if (!preg_match_all('/<([a-z][^>]*)>/iu', $text, $matches)) {
         return array();
     }
     $elements = $matches[1];

     // Group 3 is a double-quoted value, group 4 an unquoted one.
     $value_pattern = '=(("([^"]*)")|([^\s]*))';

     // Picks the attribute value out of a $value_pattern match.  Group 4
     // is absent from the match array when the quoted alternative matched,
     // so it must not be read unconditionally.
     $pick_value = function ($match) {
         if (isset($match[3]) && $match[3] !== '') {
             return $match[3];
         }
         return isset($match[4]) ? $match[4] : '';
     };

     // Match elements and attributes.
     foreach ($match_elements as $pair) {
         list($name, $attr) = $pair;

         // The attribute name must follow whitespace or a closing quote so
         // that e.g. "src" is not found inside "dynsrc" or "data-src".  The
         // "s" modifier lets "." cross line breaks inside multi-line tags.
         $pattern = '/^' . preg_quote($name, '/') . '\s(?:.*[\s"\'])?' .
                 preg_quote($attr, '/') . $value_pattern . '/isu';

         // Only the "archive" attribute holds a URL list.
         if ($attr === 'archive') {
             $split_pattern = ($name === 'object')
                     ? '/\s+/u'      // Space-separated URL list
                     : '/,\s*/u';    // Comma-separated URL list
         } else {
             $split_pattern = null;  // Single URL
         }

         foreach ($elements as $element) {
             if (!preg_match($pattern, $element, $match)) {
                 continue;
             }
             $value = $pick_value($match);
             if ($split_pattern === null) {
                 $urls[$name][$attr][] = $value;
                 continue;
             }
             $items = preg_split($split_pattern, $value, -1, PREG_SPLIT_NO_EMPTY);
             foreach ($items as $item) {
                 $urls[$name][$attr][] = $item;
             }
         }
     }

     // Match meta http-equiv elements.
     $content_pattern = '/content'  . $value_pattern . '/iu';
     $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
     foreach ($match_metas as $match_meta) {
         $attr_pattern = '/http-equiv="?' . $match_meta . '"?/iu';
         foreach ($elements as $element) {
             if (!preg_match('/^meta/iu', $element) ||
                     !preg_match($attr_pattern, $element) ||
                     !preg_match($content_pattern, $element, $match)) {
                 continue;
             }
             $value = $pick_value($match);
             if ($match_meta != 'refresh') {
                 $urls['meta']['http-equiv'][] = $value;
             } else if (preg_match($refresh_pattern, $value, $match)) {
                 // "5; url=http://..." - keep only the target URL.
                 $urls['meta']['http-equiv'][] = $match[2];
             }
         }
     }

     // Match style attributes.
     $urls['style'] = array();
     $style_pattern = '/style' . $value_pattern . '/iu';
     foreach ($elements as $element) {
         if (!preg_match($style_pattern, $element, $match)) {
             continue;
         }
         $style_urls = extract_css_urls($pick_value($match));
         if (!empty($style_urls)) {
             $urls['style'] = array_merge_recursive($urls['style'], $style_urls);
         }
     }

     // Match style bodies.
     if (preg_match_all('/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies)) {
         foreach ($style_bodies[1] as $style_body) {
             $style_urls = extract_css_urls($style_body);
             if (!empty($style_urls)) {
                 $urls['style'] = array_merge_recursive($urls['style'], $style_urls);
             }
         }
     }
     if (empty($urls['style'])) {
         unset($urls['style']);
     }

     return $urls;
 }
 /**
  * Extract URLs from UTF-8 CSS text.
  *
  * URLs within @import statements and url() property functions are extracted
  * and returned in an associative array of arrays.  Array keys indicate
  * the use context for the URL, including:
  *
  *   "import"    URLs from @import '...', @import "..." and
  *               @import url(...)
  *   "property"  URLs from any other url(...) occurrence
  *
  * Each value in the associative array is an array of URLs with CSS
  * backslash escapes removed ("a\ b.png" becomes "a b.png").  A key is
  * only present when at least one URL was found for that context.
  *
  * @param string $text the UTF-8 text to scan.
  * @return array an associative array of arrays of URLs; empty when the
  *               text contains no URLs.
  *
  * See:
  *   http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
  */
 function extract_css_urls( $text )
 {
     $urls = array( );

     $url_pattern     = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
     $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
     $pattern         = '/(' .
          '(@import\s*[\'"]' . $url_pattern     . '[\'"])' .
         '|(@import\s*'      . $urlfunc_pattern . ')'      .
         '|('                . $urlfunc_pattern . ')'      .  ')/iu';
     if ( !preg_match_all( $pattern, $text, $matches ) )
         return $urls;

     // Capture group holding the URL for each alternative of $pattern,
     // mapped to the context key it is reported under.  The groups are
     // processed one after the other, so all quoted @imports come first,
     // then all @import url(...), then plain url(...) properties.
     $groups = array(
         3  => 'import',     // @import '...' / @import "..."
         7  => 'import',     // @import url(...)
         11 => 'property',   // url(...)
     );
     foreach ( $groups as $group => $context )
     {
         foreach ( $matches[$group] as $match )
         {
             // An empty string means this alternative did not match.  A
             // plain string comparison is used because empty() would also
             // throw away the valid URL "0".
             if ( $match === '' )
                 continue;
             $urls[$context][] = preg_replace( '/\\\\(.)/u', '\\1', $match );
         }
     }

     return $urls;
 }