284 lines
9.0 KiB
PHP
284 lines
9.0 KiB
PHP
<?php
|
|
/**
|
|
* Syntax-based normalization of URIs
|
|
*
|
|
* This normalises URIs according to the RFC 3986 specification
|
|
* www . apps . ietf . org/rfc/rfc3986.html
|
|
*
|
|
* Example usage:
|
|
* <code>
|
|
* require_once 'PageCacheURLNormalizer.php';
|
|
*
|
|
* $url = 'eXAMPLE://a/./b/../b/%63/%7bfoo%7d';
|
|
* $un = new PageCacheURLNormalizer();
|
|
* $un->setUrl( $url );
|
|
* echo $un->normalize();
|
|
*
|
|
* // result: "example://a/b/c/%7Bfoo%7D"
|
|
* </code>
|
|
*
|
|
* @author Glen Scott <glen_scott @ yahoo.co.uk>
|
|
* @copyright Glen Scott
|
|
* @license Open Source
|
|
*/
|
|
|
|
// Stop executing this file immediately unless the _PS_VERSION_ constant has been defined.
if (!defined('_PS_VERSION_')) {exit;}
|
|
|
|
// Check existence of the class to be able to handle compatibility problems in a friendly way
|
|
if (!class_exists('PageCacheURLNormalizer')) {
|
|
|
|
if (!function_exists('pagecacheRawurlencode')) {
    /**
     * Recursively percent-decode a query-string value.
     *
     * NOTE: despite its name, this helper DECODES (it calls rawurldecode()).
     * The name is kept because callers reference it as a string callback.
     *
     * Nested arrays are walked recursively and their keys are preserved, so
     * that "filter[color]=red" is not rewritten as "filter[0]=red".
     *
     * @param string|array $value A single value or a (possibly nested) array of values
     *
     * @return string|array The decoded value, with the same array structure and keys
     */
    function pagecacheRawurlencode($value)
    {
        if (is_array($value)) {
            // array_map() with exactly one input array keeps that array's keys.
            return array_map('pagecacheRawurlencode', $value);
        }

        return rawurldecode($value);
    }
}
|
|
|
|
/**
 * Syntax-based URL normalizer following RFC 3986.
 *
 * A URL is split into its components with parse_url() by setUrl();
 * normalize() then rebuilds it with a lower-cased scheme and host, the
 * default port removed, dot-segments removed from the path, and the query
 * string and fragment re-encoded.
 *
 * Components are compared against the empty string rather than tested for
 * truthiness, so that a component whose value is "0" (for example the
 * query in "/page?0") is not mistaken for a missing component.
 */
class PageCacheURLNormalizer
{
    /** URL last given to setUrl(); reset to '' when it cannot be parsed. */
    private $url;

    // Individual URL components as returned by parse_url(); '' when absent.
    private $scheme;
    private $host;
    private $port;
    private $user;
    private $pass;
    private $path;
    private $query;
    private $fragment;

    /** Default ports, keyed by the lower-cased scheme INCLUDING its trailing colon. */
    private $default_scheme_ports = array('http:' => 80, 'https:' => 443,);

    /** Names of the component properties, matching the keys used by parse_url(). */
    private $components = array('scheme', 'host', 'port', 'user', 'pass', 'path', 'query', 'fragment',);

    /**
     * @param string|null $url Optional URL to load right away (see setUrl())
     */
    public function __construct($url = null)
    {
        if ($url) {
            $this->setUrl($url);
        }
    }

    /**
     * @return string|null The URL last set (or produced by normalize()); '' after a parse failure
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Store a URL and split it into its components.
     *
     * Components missing from the URL are reset to ''. When parse_url()
     * cannot parse the URL, the stored URL and every component are reset
     * to ''.
     *
     * @param string $url The URL to parse
     *
     * @return bool true when the URL was parsed, false otherwise
     */
    public function setUrl($url)
    {
        $this->url = $url;

        $parsed = parse_url($this->url);

        if (!$parsed) {
            $this->url = '';
            foreach ($this->components as $name) {
                $this->$name = '';
            }

            return false;
        }

        // Take each component from the parse result, or flush it when absent.
        foreach ($this->components as $name) {
            $this->$name = isset($parsed[$name]) ? $parsed[$name] : '';
        }

        return true;
    }

    /**
     * Build the normalized form of the current URL, store it through
     * setUrl() and return it.
     *
     * @return string|null The normalized URL, as returned by getUrl()
     */
    public function normalize()
    {
        // Scheme: lower-cased and followed by ":" (RFC 3986 section 3.1).
        $scheme = '';
        if ($this->hasValue($this->scheme)) {
            $scheme = Tools::strtolower($this->scheme) . ':';
        }

        $authority = $this->buildAuthority($scheme);

        // Path (RFC 3986 section 3.3).
        $path = '';
        if ($this->hasValue($this->path)) {
            $path = $this->removeDotSegments($this->path);

            // urlDecodeUnreservedChars() is deliberately NOT applied here, in
            // order to preserve Greek URLs for example.
            $path = $this->urlDecodeReservedSubDelimChars($path);
        } elseif ($this->hasValue($this->url)) {
            // Add the default path only when a URL is present.
            $path = '/';
        }

        $query = $this->buildQuery();
        $fragment = $this->buildFragment();

        // Re-parse the result so that every component property reflects the normalized URL.
        $this->setUrl($scheme . $authority . $path . $query . $fragment);

        return $this->getUrl();
    }

    /**
     * Remove "." and ".." segments from a path (RFC 3986 section 5.2.4).
     *
     * @param string $path The path to clean up
     *
     * @return string The path without dot-segments
     */
    public function removeDotSegments($path)
    {
        $input = (string) $path;
        $output = '';

        // Compare with '' instead of using empty(): the path "0" is not empty.
        while ($input !== '') {
            if (preg_match('!^\.\.?/!', $input, $match)) {
                // A: drop a leading "../" or "./".
                $input = (string) substr($input, strlen($match[0]));
            } elseif (preg_match('!^/\.(/|$)!D', $input, $match)) {
                // B: a leading "/./", or "/." as a complete segment, becomes "/".
                $input = '/' . (string) substr($input, strlen($match[0]));
            } elseif (preg_match('!^/\.\.(/|$)!D', $input, $match)) {
                // C: a leading "/../", or "/.." as a complete segment, becomes "/"...
                // (The boundary check keeps names such as "/..foo" untouched.)
                $input = '/' . (string) substr($input, strlen($match[0]));

                // ...and the last segment already written is removed, with its preceding "/".
                $output = preg_replace('!/([^/]+)$!x', '', $output);
            } elseif ($input === '.' || $input === '..') {
                // D: nothing left but "." or "..".
                $input = '';
            } else {
                // E: move the first segment (leading slashes included) to the output.
                preg_match('!^/*[^/]*!', $input, $match);
                $output .= $match[0];
                $input = (string) substr($input, strlen($match[0]));
            }
        }

        return $output;
    }

    /**
     * @return string|null The scheme component of the current URL
     */
    public function getScheme()
    {
        return $this->scheme;
    }

    /**
     * Decode a string completely, encode it again with rawurlencode(), then
     * restore the "/", ":" and "@" characters.
     *
     * normalize() does not currently call this method.
     *
     * @param string $string The string to process
     *
     * @return string The re-encoded string
     */
    public function urlDecodeUnreservedChars($string)
    {
        $encoded = rawurlencode(rawurldecode($string));

        return str_replace(array('%2F', '%3A', '%40'), array('/', ':', '@'), $encoded);
    }

    /**
     * Decode the percent-encoded form of the reserved "sub-delims"
     * characters (RFC 3986 section 2.2).
     *
     * Hexadecimal digits are matched case-insensitively, so "%2b" is
     * decoded exactly like "%2B".
     *
     * @param string $string The string to process
     *
     * @return string The string with encoded sub-delims replaced by the literal characters
     */
    public function urlDecodeReservedSubDelimChars($string)
    {
        $encoded = array('%21', '%24', '%26', '%27', '%28', '%29', '%2A', '%2B', '%2C', '%3B', '%3D');
        $decoded = array('!', '$', '&', "'", '(', ')', '*', '+', ',', ';', '=');

        return str_ireplace($encoded, $decoded, $string);
    }

    /**
     * Tell whether a URL component is present, i.e. neither null nor ''.
     */
    private function hasValue($value)
    {
        return $value !== null && $value !== '';
    }

    /**
     * Build the "//user:pass@host:port" part (RFC 3986 section 3.2); '' when there is no host.
     *
     * @param string $scheme Normalized scheme including its trailing colon, or ''
     */
    private function buildAuthority($scheme)
    {
        if (!$this->hasValue($this->host)) {
            return '';
        }

        $authority = '//';

        // User information.
        if ($this->hasValue($this->user)) {
            $authority .= $this->user;
            if ($this->hasValue($this->pass)) {
                $authority .= ':' . $this->pass;
            }
            $authority .= '@';
        }

        // Host, converted to lower case.
        $authority .= Tools::strtolower($this->host);

        // Port, omitted when it is the default one for the scheme.
        $port = $this->port;
        if (isset($this->default_scheme_ports[$scheme])
            && $port == $this->default_scheme_ports[$scheme]) {
            $port = '';
        }

        if ($this->hasValue($port)) {
            $authority .= ':' . $port;
        }

        return $authority;
    }

    /**
     * Build the normalized "?query" part (RFC 3986 section 3.4); '' when there is no query.
     */
    private function buildQuery()
    {
        if (!$this->hasValue($this->query)) {
            return '';
        }

        parse_str($this->query, $params);

        $keys = array_map('rawurldecode', array_keys($params));
        $values = array_map('pagecacheRawurlencode', array_values($params));
        $params = array_combine($keys, $values);

        $query = '?' . str_replace('+', '%20', http_build_query($params, '', '&'));

        // http_build_query() adds an equals sign after keys that have an empty value; remove it.
        return str_replace('=&', '&', rtrim($query, '='));
    }

    /**
     * Build the normalized "#fragment" part (RFC 3986 section 3.5); '' when there is no fragment.
     */
    private function buildFragment()
    {
        if (!$this->hasValue($this->fragment)) {
            return '';
        }

        return '#' . rawurlencode(rawurldecode($this->fragment));
    }
}
|
|
}
|