# nytimes_helper.py
import datetime
import re
from collections import defaultdict

import pandas as pd
import requests
from bs4 import BeautifulSoup

from location_helper import insert_location_into_articles
from watson import get_sentiment

def retrieve_article_data(begin_date, end_date, pages, key):
    """Query the NYTimes Article Search API and return a DataFrame of articles."""
    url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json'
    params = {'api-key': key,
              'begin_date': begin_date,
              'end_date': end_date}
    rows = []
    for page in range(pages):
        params['page'] = page
        json_response = requests.get(url, params=params).json()
        if json_response['status'] == 'OK':
            for article in json_response['response']['docs']:
                web_url = article['web_url']
                rows.append({'url': web_url,
                             'url_image': get_article_image(web_url),
                             'article': article['headline']['main'],
                             'datetime': article['pub_date'],
                             'news_source': article['source'],
                             'content': get_article_content(web_url)})
    # Build the DataFrame once at the end; DataFrame.append was removed in pandas 2.x.
    return pd.DataFrame(rows, columns=['url', 'url_image', 'article',
                                       'datetime', 'news_source', 'content'])
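
# A minimal usage sketch (the key and dates below are placeholders, not values
# from a real run; dates use the API's YYYYMMDD format):
#
#     df = retrieve_article_data('20240101', '20240107', pages=2, key='YOUR_API_KEY')
#     print(df[['article', 'datetime']].head())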

def get_article_content(url):
    """Scrape the body text of a NYTimes article page."""
    url = url.replace('\\/', '/')
    page = requests.get(url)
    try:
        page.raise_for_status()
    except Exception as exc:
        print('Problem downloading: %s' % exc)
        return ''
    soup = BeautifulSoup(page.text, 'html.parser')
    # NYTimes wraps story paragraphs in divs with this class name; this is
    # brittle and will break if the site's markup changes.
    content_pieces = soup.find_all('div', {'class': 'StoryBodyCompanionColumn'})
    content = []
    for piece in content_pieces:
        # Strip everything but alphanumerics so downstream NLP sees clean text.
        cleaned_content = re.sub('[^0-9a-zA-Z]+', ' ', piece.text).strip()
        content.append(cleaned_content)
    return ' '.join(content)

def get_article_image(url):
    """Return the src of the first image on the article page, or '' if none."""
    url = url.replace('\\/', '/')
    page = requests.get(url)
    try:
        page.raise_for_status()
    except Exception as exc:
        print('Problem downloading: %s' % exc)
        return ''
    soup = BeautifulSoup(page.text, 'html.parser')
    img = soup.find('img')
    # Guard against pages with no <img> tag; calling .get on None would raise.
    return img.get('src') if img is not None else ''
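
# Hedged alternative (not part of the original helpers): the first <img> on a
# page is often a logo or ad, while article pages commonly expose their lead
# image through an og:image meta tag. This sketch assumes that tag is present.
def get_article_og_image(url):
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    meta = soup.find('meta', property='og:image')
    return meta.get('content', '') if meta is not None else ''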

def get_article_dicts(df_articles):
    """Convert the articles DataFrame into a list of {url, content, article} dicts."""
    article_content_dicts = []
    for index, row in df_articles.iterrows():
        article_content_dicts.append({'url': row['url'],
                                      'content': row['content'],
                                      'article': row['article']})
    return article_content_dicts

def get_articles_coordinates(cache):
    """Fetch the past week's articles, tag them with locations and sentiment,
    and group them by 'lng,lat' coordinate strings."""
    end_date = datetime.date.today()
    start_date = end_date - datetime.timedelta(days=7)
    start_date = start_date.strftime('%Y%m%d')
    end_date = end_date.strftime('%Y%m%d')
    # Get n pages from the past week from the NYTimes API
    df_articles = retrieve_article_data(start_date, end_date, 1, '64421d7edeab4ab9ad0ceea49bfcef03')
    articles_contents_list = get_article_dicts(df_articles)
    locations_included_list = insert_location_into_articles(articles_contents_list, cache)
    for entry in locations_included_list:
        entry['sentiment'] = get_sentiment(entry.get('url', ''))
        print(entry.get('sentiment', 0))
    ret = defaultdict(list)
    for article in locations_included_list:
        for location in article['location']:
            if location is None:
                continue
            pic_link = 'https://texasperformingarts.org/sites/files/tpa/tpa_news_images/new-york-times-logo.jpg'
            coordinates = location.get('coordinates')
            coord_key = str(coordinates.get('lng')) + ',' + str(coordinates.get('lat'))
            ret[coord_key].append({
                'url': article.get('url', ''),
                'title': article.get('article', ''),
                'pic': pic_link,
                'sentiment': article.get('sentiment', 0)
            })
    return ret
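
# The returned mapping groups article summaries under 'lng,lat' keys, roughly
# (values here are illustrative, not real output):
#
#     {'-73.98,40.75': [{'url': 'https://www.nytimes.com/...',
#                        'title': 'Example headline',
#                        'pic': 'https://.../new-york-times-logo.jpg',
#                        'sentiment': 0.42}]}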

def get_dict_to_csv(json_results):
    """Flatten the coordinate->articles mapping into CSV body lines."""
    articles_coords_dict = json_results
    # A set deduplicates repeated articles, but makes line order nondeterministic.
    csv_body_lines = set()
    for coord in articles_coords_dict:
        for article in articles_coords_dict[coord]:
            csv_body = coord + ','
            csv_body += ','.join([article.get('url', ''),
                                  '"' + article.get('title', '') + '"',
                                  article.get('pic', ''),
                                  str(article.get('sentiment', 0.0))])
            csv_body_lines.add(csv_body)
    return '\n'.join(csv_body_lines)
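
# Each emitted line has the form (illustrative values, not real output):
#
#     lng,lat,url,"title",pic,sentiment
#     -73.98,40.75,https://www.nytimes.com/...,"Example headline",https://.../logo.jpg,0.42
#
# Titles are quoted to survive embedded commas, but embedded quotes are not
# escaped, so the standard csv module would be a safer writer here.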

if __name__ == '__main__':
    # Smoke test: fetch one page of articles from the past week and preview them.
    end_date = datetime.date.today()
    start_date = end_date - datetime.timedelta(days=7)
    start_date = start_date.strftime('%Y%m%d')
    end_date = end_date.strftime('%Y%m%d')
    df_articles = retrieve_article_data(start_date, end_date, 1, '64421d7edeab4ab9ad0ceea49bfcef03')
    print(df_articles.head(5))