forked from tommyrot/superseriousstats
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurltools.php
148 lines (133 loc) · 4.45 KB
/
urltools.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
<?php
/**
* Copyright (c) 2007-2016, Jos de Ruijter <jos@dutnie.nl>
*/
/**
* Various functions related to URL validation and presentation.
*
* Guided by:
* - RFC 3986
* - RFC 1034 section 3.5
* - RFC 1123 section 2.1
*
* Notes:
* - Only the http:// and https:// schemes will validate. URLs without a scheme
* are considered http://.
* - User part in authority is not recognized and will not validate.
* - IPv4 addresses only.
* - TLDs as in http://data.iana.org/TLD/tlds-alpha-by-domain.txt (this file
* can be stored locally and updated at will).
* - The root domain is excluded from the FQDN (not from the other elements).
* - Square brackets must be percent encoded.
*/
class urltools
{
    /**
     * Regular expression matching the optional scheme plus the authority at the
     * start of a URL. Used to lower case only that part of the input.
     */
    private static $regexp_callback = '';

    /**
     * Regular expression matching and capturing a complete URL. An empty string
     * means the expressions (and the TLD list) have not been set up yet.
     */
    private static $regexp_complete = '';

    /**
     * Set of known TLDs, keyed by the lower case TLD including its leading dot
     * (e.g. '.com' => true). Stays empty when the TLD file could not be read, in
     * which case TLD validation is skipped.
     */
    private static $valid_tlds = [];

    private function __construct()
    {
        /**
         * This is a static class and should not be instantiated.
         */
    }

    /**
     * Assemble the regular expressions used to normalize and validate URLs.
     */
    private static function build_regexps()
    {
        $domain = '(?<domain>[a-z0-9]([a-z0-9-]{0,61}?[a-z0-9]|[a-z0-9]{0,62})?(\.[a-z0-9]([a-z0-9-]{0,61}?[a-z0-9]|[a-z0-9]{0,62})?)*)';
        $tld = '(?<tld>\.[a-z0-9]([a-z0-9-]{0,61}?[a-z0-9]|[a-z0-9]{0,62})?)';
        $fqdn = '(?<fqdn>'.$domain.$tld.')\.?';
        $ipv4address = '(?<ipv4address>(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])(\.(25[0-5]|(2[0-4]|1[0-9]|[1-9])?[0-9])){3})';
        $port = '(?<port>(6553[0-5]|(655[0-2]|(65[0-4]|(6[0-4]|[1-5][0-9]|[1-9])[0-9]|[1-9])[0-9]|[1-9])?[0-9]))';
        $authority = '(?<authority>('.$ipv4address.'|'.$fqdn.')(:'.$port.')?)';
        $unreserved = '[a-z0-9_.~-]';
        $pct_encoded = '%[0-9a-f]{2}';
        $sub_delims = '[!$&\'()*+,;=]';
        $pchar = '('.$unreserved.'|'.$pct_encoded.'|'.$sub_delims.'|[:@])';
        $fragment = '(?<fragment>(#('.$pchar.'|[\/?])*)?)';
        $path = '(?<path>(\/\/?('.$pchar.'+\/?)*)?)';
        $query = '(?<query>(\?('.$pchar.'|[\/?])*)?)';
        $scheme = '(?<scheme>https?:\/\/)';
        self::$regexp_callback = '/^'.$scheme.'?'.$authority.'/i';
        self::$regexp_complete = '/^(?<url>'.$scheme.'?'.$authority.$path.$query.$fragment.')$/i';
    }

    /**
     * Normalize and validate a URL and return an array with its elements.
     *
     * @param string $url The URL to inspect, with or without a scheme.
     * @return array|false An array with the keys "url", "scheme", "authority",
     *                     "ipv4address", "fqdn", "domain", "tld", "path",
     *                     "query", "fragment" (strings, empty when absent) and
     *                     "port" (int, 0 when absent), or false if the URL does
     *                     not validate.
     */
    public static function get_elements($url)
    {
        /**
         * Assemble the regular expressions and load the TLD list on first use.
         */
        if (self::$regexp_complete === '') {
            self::build_regexps();

            /**
             * Read "tlds-alpha-by-domain.txt" and put all TLDs in a set against
             * which we can validate found URLs. If the aforementioned file does
             * not exist or fails to be read, the TLD check will not be done. This
             * would be an unexpected and undesired exception though.
             *
             * The readability check avoids a PHP warning from file() on top of
             * the notice we emit ourselves.
             */
            $tld_file = __DIR__.'/tlds-alpha-by-domain.txt';
            $lines = is_readable($tld_file) ? file($tld_file) : false;

            if ($lines === false) {
                output::output('notice', __METHOD__.'(): failed to open file: \'tlds-alpha-by-domain.txt\', tld validation disabled');
            } else {
                foreach ($lines as $line) {
                    $line = trim($line);

                    /**
                     * Skip blank lines and the comment line(s) of the file.
                     */
                    if ($line !== '' && strpos($line, '#') === false) {
                        self::$valid_tlds['.'.strtolower($line)] = true;
                    }
                }
            }
        }

        /**
         * Convert scheme and authority to lower case. The rest of the URL is
         * case sensitive and left as is.
         */
        $url = preg_replace_callback(self::$regexp_callback, static function ($matches) {
            return strtolower($matches[0]);
        }, $url);

        /**
         * preg_replace_callback() yields null when PCRE runs into an error, and
         * an array for array input. Neither can be a valid URL.
         */
        if (!is_string($url)) {
            return false;
        }

        /**
         * Validate and further process the URL.
         */
        if (!preg_match(self::$regexp_complete, $url, $matches)) {
            return false;
        }

        /**
         * Verify if the TLD is valid. If the validation set is empty we skip
         * this step.
         */
        if (!empty(self::$valid_tlds) && !empty($matches['tld']) && !isset(self::$valid_tlds[$matches['tld']])) {
            return false;
        }

        /**
         * The maximum allowed length of the FQDN (root domain excluded) is 253
         * characters: the 255 octet limit of RFC 1035 includes one length octet
         * for the first label and one for the root label.
         */
        if (strlen($matches['fqdn']) > 253) {
            return false;
        }

        /**
         * If the URL has no scheme, http:// is assumed. Update the elements.
         */
        if (empty($matches['scheme'])) {
            $matches['scheme'] = 'http://';
            $matches['url'] = 'http://'.$matches['url'];
        }

        /**
         * Create and return an array with all the elements of the URL. Always
         * pass along an empty string for nonexistent elements.
         */
        $urldata = [];

        foreach (['url', 'scheme', 'authority', 'ipv4address', 'fqdn', 'domain', 'tld', 'path', 'query', 'fragment'] as $element) {
            $urldata[$element] = empty($matches[$element]) ? '' : $matches[$element];
        }

        /**
         * Make sure the only numeric element isn't passed along as a string.
         */
        $urldata['port'] = empty($matches['port']) ? 0 : (int) $matches['port'];

        return $urldata;
    }
}