# -*- coding: utf-8 -*-
"""
Scrape author articles from thephoenix.com
Currenly focused on Boston, although seems to generally be working for the
others. Pass in city before author slug -- defaults to Boston.
There might be a better way: word is that there is a private API.
Usage:
python author_pages.py [city] author-slug
Prints out each article as a JSON structure. Tail author_pages.log to see
script progress.
"""
import datetime
import json
import logging
import sys
import time
import urllib2

import lxml.html

logging.basicConfig(filename='author_pages.log', level=logging.INFO)

SITE_ROOTS = {
    'boston': 'http://thephoenix.com/boston',
    'portland': 'http://portland.thephoenix.com',
    'providence': 'http://providence.thephoenix.com',
}
AUTHOR_URL = '%(site_root)s/authors/%(author_slug)s/'

DELAY = 2  # seconds to wait between page fetches

last_fetch = None


def _wait():
    """Return how many seconds remain before the next fetch is allowed."""
    if last_fetch is None:
        return 0
    return DELAY - (datetime.datetime.now() - last_fetch).seconds


def fetch(url):
    """Fetch a URL, sleeping as needed to keep DELAY seconds between fetches."""
    global last_fetch
    delta = _wait()
    if delta > 0:
        logging.info('waiting: %s seconds' % delta)
        time.sleep(delta)
    last_fetch = datetime.datetime.now()
    logging.info('fetching: %s' % url)
    return urllib2.urlopen(url).read()


def prepend_link(city, link):
    """Turn a site-relative link into an absolute URL for the given city."""
    assert link.startswith('/'), 'Unsupported relative link'
    # The Boston site root already ends in /boston, so strip the duplicate.
    if link.lower().startswith('/boston/'):
        link = link[len('/boston'):]
    return '%s%s' % (SITE_ROOTS[city], link)


def next_page_link(links):
    """Return the first link whose text mentions 'next', if any."""
    for link in links:
        if 'next' in link.text_content():
            return link


def fetch_author_page(city, url, first=True):
    """Fetch one article, recursing into its continuation pages for content."""
    page = fetch(url)
    doc = lxml.html.fromstring(page)
    title = None
    author = None
    pubdate = None
    teaser = None
    topics = None
    if first:
        # Article metadata only appears on the first page of a
        # multi-page article.
        article = doc.cssselect('div#articlecontent')[0]
        title = article.cssselect('h1')[0].text_content()
        author_span = article.cssselect('span.author')[0]
        author = author_span.cssselect('a strong')[0].text_content()
        pubdate = author_span.text_content().split('|')[-1].strip()
        teaser = article.cssselect('div.teaser')[0].text_content()
        topics = [t.text_content() for t in article.cssselect('a[rel="TOPIC"]')]
    body = doc.cssselect('div#articlecontent .bodyText')
    content = [e.text_content() for e in body]
    next_page = next_page_link(doc.cssselect('div#articlecontent a'))
    if next_page is not None:
        next_pages = fetch_author_page(
            city, prepend_link(city, next_page.get('href')), first=False)
        content += next_pages['content']
    return {
        'title': title,
        'author': author,
        'pubdate': pubdate,
        'teaser': teaser,
        'topics': topics,
        'content': content,
    }


def _fetch_articles(city, links):
    """Yield each linked article as a JSON string."""
    for link in links:
        href = link.get('href')
        if not href.startswith('http:'):
            href = prepend_link(city, href)
        yield json.dumps(fetch_author_page(city, href))


def fetch_author_articles(city, author, url=None):
    """Yield every article by an author, one JSON string per article."""
    if url is None:
        url = AUTHOR_URL % {
            'site_root': SITE_ROOTS[city],
            'author_slug': author,
        }
    page = fetch(url)
    doc = lxml.html.fromstring(page)
    links = doc.cssselect('div#ArticleList h3 a')
    index_links_div = doc.cssselect('div#ArticleList')[0].getparent().getnext()
    next_page = next_page_link(index_links_div.cssselect('a'))
    for article in _fetch_articles(city, links):
        yield article
    if next_page is not None:
        href = prepend_link(city, next_page.get('href'))
        logging.info('Fetching next index page: %s' % href)
        # The recursive call returns a generator; iterate it so the
        # remaining index pages are actually fetched and yielded.
        for article in fetch_author_articles(city, author, href):
            yield article
if __name__=="__main__":
city = 'boston'
if len(sys.argv) == 1:
print '\nPlease indicate an author\n'
sys.exit(0)
elif len(sys.argv) == 2:
city = 'boston'
author = sys.argv[1]
else:
city = sys.argv[1].lower()
author = sys.argv[2]
for article in fetch_author_articles(city, author):
print article
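
# Programmatic use (a minimal sketch, assuming this file is importable as
# author_pages and thephoenix.com is still reachable; the slug below is
# hypothetical):
#
#     import author_pages
#     for article_json in author_pages.fetch_author_articles('boston', 'a-slug'):
#         print article_json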