This repository has been archived by the owner on Nov 11, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlengthener.py
96 lines (86 loc) · 3.93 KB
/
lengthener.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/python3
# Copyright (c) 2011 Mark Eichin <eichin@thok.org>
# See ./LICENSE (MIT style.)
"""un-shorten a url (if it's from a white-listed set of shorteners.)"""
__version__ = "0.5"
__author__ = "Mark Eichin <eichin@thok.org>"
__license__ = "MIT"
import http.client
import urllib.parse
import sys
# "general" shorteners allow end users (public or customers) to generate
# links to *any* site, represented by the domain plus a short alphanumeric
# token (tinyarrows.com and txtn.us are notable exceptions, which I'll add
# if I ever see anyone actually using them :-)
# Hosts of general-purpose shorteners, kept in alphabetical order.
general_shorteners = {
    "1.usa.gov",
    "awe.sm",  # http://totally.awe.sm/ "analytics for social media"
    "bit.ly",
    "dlvr.it",
    "goo.gl",
    "ht.ly",  # another ow.ly/hootsuite shortener
    "is.gd",  # http://is.gd/ethics.php - also v.gd, but it is preview-only
    "j.mp",
    "ow.ly",
    "wp.me",  # wordpress.com
}
# feeds.feedburner.com is a (google) redirector, but at least the url is semi
# informative, if not final (and not shortened.) New category?
# "local" shorteners let a site manage/track popular internal links directly,
# and make detailed urls more easily shareable. As with other shorteners,
# these are followed by a single alphanumeric token (and sometimes cgi arguments)
# but local ones always link to a particular target site; we might want to
# explicitly check for that in the future.
# Hosts of site-specific ("local") shorteners; the trailing comment on each
# entry names the site that the shortener is expected to redirect to.
local_shorteners = {
    "engt.co",  # engadget.com
    "adafru.it",  # adafruit.com
    "wapo.st",  # washingtonpost.com
    "df4.us",  # daringfireball.com
    "onforb.es",  # forbes.com
    "on.cnn.com",  # cnn.com (uhhh...)
    "nyti.ms",  # nytimes.com
    "4sq.com",  # foursquare.com
    "youtu.be",  # youtube.com
    "bo.st",  # boston.com
    "say.ly",  # whosay.com
    "mysp.ac",  # myspace.com
    # Was "huff.po", which is not a resolvable domain (".po" is not a TLD);
    # the Huffington Post shortener lives at huff.to.
    "huff.to",  # huffingtonpost.com
    "apne.ws",  # ap mobile
    "kck.st",  # kickstarter.com
    "yhoo.it",  # answers.yahoo.com, maybe more of yahoo
    "read.bi",  # businessinsider.com
    "gu.com",  # guardian.co.uk
    "tnw.co",  # thenextweb.com
    "sprkfn.com",  # sparkfun.com
    "n.pr",  # npr.org
    "nydn.us",  # newyorkdailynews.com
    "vrge.co",  # theverge.com
}
# can't use urllib2 for this - in order to avoid redirected POSTs, it turns
# any redirect into a GET... even if the redirect was originally a HEAD.
# Will dig in to that later... stackoverflow gets credit for pointing me in
# the right direction:
# http://stackoverflow.com/questions/107405/how-do-you-send-a-head-http-request-in-python
# though the actual answers there are incomplete or incorrect.
# Initially, no caching, just a lookup with a fast timeout.
# No cookie support either, or specific user-agent, until we find we need one.
def lengthen(url, *, timeout=5.0):
    """Try to get a redirect for a shortened URL; return its target if found.

    A lookup is only attempted when the URL's network location is listed in
    ``general_shorteners`` or ``local_shorteners``, its scheme is ``http``,
    and it has neither a query string nor a fragment. Any other URL returns
    ``None`` without any network traffic.

    Args:
        url: The (possibly shortened) URL to look up.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The value of the ``Location`` header of the HEAD response, or
        ``None`` if the URL was not eligible or no such header was sent.

    Raises:
        Errors raised by ``http.client`` while connecting or reading the
        response (including timeouts) are not caught here.
    """
    split_url = urllib.parse.urlsplit(url)
    host = split_url.netloc
    if host not in general_shorteners and host not in local_shorteners:
        return None
    if split_url.scheme != "http":
        return None
    if split_url.query:  # or add this back in?
        return None
    if split_url.fragment:  # or add this back in?
        return None
    # An explicit timeout keeps a stalled shortener from blocking the caller
    # indefinitely.
    host_connection = http.client.HTTPConnection(host, timeout=timeout)
    try:
        # HEAD is enough: only the Location header is wanted, not the body.
        # A bare "http://host" has an empty path, which is not a valid
        # request target, so ask for the root instead.
        host_connection.request("HEAD", split_url.path or "/")
        head_response = host_connection.getresponse()
        return head_response.getheader("location")
    finally:
        # Release the socket even when the request or response raised.
        host_connection.close()
if __name__ == "__main__":
    # Command-line use: treat every argument as a URL and print it next to
    # whatever lengthen() returns for it, one per line.
    for url in sys.argv[1:]:
        print(f"{url} -> {lengthen(url)}")