-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurl_translate.py
39 lines (30 loc) · 1.05 KB
/
url_translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import csv
import logging
import os.path as path
import urlparse
from BeautifulSoup import BeautifulSoup
class URLTranslator:
    """Rewrite the ``href`` of ``<a>`` tags in HTML using a lookup table.

    The table maps a source URL to its replacement and is held in
    ``self.dictionary``; it can be filled directly or loaded from a CSV
    file with ``load_dictionary_from_csv``.
    """

    def __init__(self):
        # Maps a source URL (as looked up in translate) to its replacement.
        self.dictionary = {}
        # Optional base URL. When set, every href is resolved against it
        # with urljoin before being looked up in the dictionary.
        self.location = None

    def load_dictionary_from_csv(self, csvfile):
        """Add URL mappings read from the CSV file at path ``csvfile``.

        For each row, the first column is used as the source URL and the
        second as its replacement; any further columns are ignored. Rows
        with fewer than two columns are skipped. Entries are merged into
        the existing dictionary, so a later row (or a later call)
        overrides an earlier mapping for the same source URL.

        A path that does not exist is ignored and leaves the dictionary
        unchanged.
        """
        if not path.exists(csvfile):
            return
        # The context manager closes the file even if parsing fails.
        with open(csvfile) as handle:
            for row in csv.reader(handle):
                if len(row) >= 2:
                    self.dictionary[row[0]] = row[1]

    def translate(self, html):
        """Return ``html`` with known anchor targets replaced.

        Every ``<a>`` tag carrying an ``href`` is examined. If
        ``self.location`` is set, the href is first joined to it. When the
        resulting URL is a key of ``self.dictionary`` the tag's href is
        replaced by the mapped value; otherwise the tag is left untouched.
        Both outcomes are logged at INFO level.

        Returns ``None`` when ``html`` is empty or ``None``; otherwise the
        re-serialized document as a unicode string.
        """
        if not html:
            return None
        soup = BeautifulSoup(html)
        for link in soup.findAll('a'):
            href = link.get('href')
            if href is None:
                # Anchors without an href (e.g. <a name="top">) have nothing
                # to translate; indexing them would raise KeyError.
                continue
            if self.location:
                href = urlparse.urljoin(self.location, href)
            if href in self.dictionary:
                target = self.dictionary[href]
                logging.info('%s translated in %s', href, target)
                link['href'] = target
            else:
                logging.info('missing translation for %s', href)
        return unicode(soup)