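"""Search news coverage for a person, score each article's sentiment, and
cross-check the name against Interpol red notices.

Pipeline: NewsAPI -> Google Custom Search (for additional coverage) -> a
transformers sentiment pipeline -> CSV output -> Interpol red-notice check
via the local `scrapper` module.
"""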
import csv
import os
import sys

import requests
from dotenv import load_dotenv
from transformers import pipeline

from scrapper import scrape_interpol_red_notices
# Outlets used to bias the Custom Search query toward recognised news sources
news_outlets = [
    "The New York Times",
    "The Washington Post",
    "BBC News",
    "CNN",
    "Reuters",
    "Al Jazeera",
    "The Guardian",
    "Bloomberg",
    "NPR",
    "Fox News",
    "The Wall Street Journal",
    "USA Today",
    "The Independent",
    "HuffPost",
    "Los Angeles Times",
    "ABC News",
    "The Atlantic",
    "Politico",
    "Time",
    "The Economist",
    "CBS News",
    "Financial Times",
    "The Telegraph",
    "The Times",
    "Slate",
    "Newsweek",
    "ProPublica",
    "Vox",
    "The Hill",
    "Business Insider",
    "Politifact",
    "Mashable",
    "The New Yorker",
    "Vice News",
    "Axios",
    "The Associated Press",
    "The Daily Beast",
    "The Spectator",
    "The Chicago Tribune",
    "Wired",
    "The Sun",
    "Express Tribune",
    "Dawn",
    "L'Express",
    "The Globe and Mail",
    "CBC News",
    "The Australian",
    "The Age",
    "The Sydney Morning Herald",
    "Financial Post",
    "The National Post",
    "The Independent (UK)",
    "The Irish Times",
    "The Times of India",
    "India Today",
    "The Hindu",
    "Dhaka Tribune",
    "The Straits Times",
    "South China Morning Post",
    "The Japan Times",
    "Asahi Shimbun",
    "The Moscow Times",
    "Russia Today (RT)",
    "Sofia Globe",
    "Al Arabiya",
    "Arab News",
    "Haaretz",
    "The Jerusalem Post",
    "Euronews",
    "Deutsche Welle (DW)",
    "RTÉ News",
    "Sky News",
    "Channel News Asia (CNA)",
    "Zee News",
    "PTI",
    "The Economic Times",
    "Forbes",
    "Fortune",
    "The Wrap",
    "The Verge",
    "TechCrunch",
    "Gizmodo",
    "Engadget",
    "CNET",
    "The Fader",
    "Billboard",
    "Variety",
    "Rolling Stone",
    "Pitchfork",
    "NME",
    "MTV News",
    "The Ringer",
    "Bleacher Report",
    "ESPN",
    "Sports Illustrated",
    "WWE",
    "The Athletic",
    "Golf Digest",
    "Outdoor Life",
    "Men's Health"
]
load_dotenv()

# API keys are read from a .env file via python-dotenv
NEWS_API_KEY = os.getenv("NEWS_API_KEY")            # NewsAPI key
CSE_API_KEY = os.getenv("CSE_API_KEY")              # Google Custom Search API key
CSE_ID = os.getenv("CSE_ID")                        # Custom Search Engine ID
SCRAPEOPS_API_KEY = os.getenv("SCRAPEOPS_API_KEY")  # Not used directly in this file

# Sentiment pipeline; with no model specified, transformers falls back to its
# default English checkpoint (a DistilBERT model fine-tuned on SST-2 at the
# time of writing)
sentiment_analyzer = pipeline("sentiment-analysis")
def fetch_news_about_person(person_name):
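    """Query NewsAPI for articles mentioning `person_name`, print and collect
    the relevant ones, top up with Google CSE results, and save everything to
    a CSV file."""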
    url = 'https://newsapi.org/v2/everything'
    exact_person_name = f'"{person_name}"'  # Quoted for an exact-phrase match
    # Bias the query toward actual news coverage of the person
    keywords = 'discuss OR interview OR coverage OR profile'
    params = {
        'q': f'{exact_person_name} AND ({keywords}) -wikipedia -twitter -reddit -blog -forum',
        'apiKey': NEWS_API_KEY,
        'language': 'en',
        'sortBy': 'relevancy',
        'pageSize': 100,
    }
    response = requests.get(url, params=params)
    if response.status_code == 200:
        articles = response.json().get('articles', [])
        if articles:
            print(f"\nNews articles about {person_name} from NewsAPI:")
            fetched_urls = {article['url'] for article in articles}
            all_articles = []
            for i, article in enumerate(articles, 1):
                # NewsAPI can return null fields; guard before lowercasing
                article_title = article.get('title') or ''
                article_description = article.get('description') or ''
                # Keep only articles that actually name the person
                if person_name.lower() in article_title.lower() or person_name.lower() in article_description.lower():
                    article_info = print_article_info(article, i)
                    if article_info:  # Only append if article_info is not None
                        all_articles.append(article_info)
                else:
                    print(f"Article '{article_title}' is not relevant, skipping.")
            fetch_articles_using_cse(exact_person_name, fetched_urls, all_articles)
            save_articles_to_csv(all_articles, person_name)
        else:
            print(f"No articles found for {person_name}.")
    else:
        print(f"Failed to fetch news articles from NewsAPI: {response.status_code} - {response.text}")
def fetch_articles_using_cse(person_name, existing_urls, all_articles):
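    """Fetch additional articles via Google Custom Search, skipping URLs
    already returned by NewsAPI, and append them to `all_articles`."""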
    cse_url = 'https://www.googleapis.com/customsearch/v1'
    news_outlets_query = ' OR '.join(f'"{outlet}"' for outlet in news_outlets)
    # Restrict results to news-style pages from known outlets
    refined_query = f'{person_name} AND ({news_outlets_query}) AND (news OR report OR article) -wikipedia -twitter -reddit -blog -forum -site:linkedin.com -YouTube'
    cse_params = {
        'key': CSE_API_KEY,
        'cx': CSE_ID,
        'q': refined_query,
        'num': 10  # CSE allows at most 10 results per request
    }
    cse_response = requests.get(cse_url, params=cse_params)
    if cse_response.status_code == 200:
        cse_articles = cse_response.json().get('items', [])
        if cse_articles:
            print(f"\nNews articles about {person_name} from Custom Search Engine:")
            for i, article in enumerate(cse_articles, 1):
                article_url = article.get('link')
                if article_url and article_url not in existing_urls:
                    article_info = print_article_info(article, i, is_cse=True)
                    all_articles.append(article_info)  # Store article info for the CSV
                else:
                    print(f"Article '{article.get('title')}' is already fetched from NewsAPI, skipping.")
        else:
            print(f"No additional articles found for {person_name} in Custom Search Engine.")
    else:
        print(f"Failed to fetch news articles from CSE: {cse_response.status_code} - {cse_response.text}")
def print_article_info(article, index, is_cse=False):
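    """Print a one-article summary with sentiment scores and return a dict of
    its fields for the CSV writer."""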
    source = "CSE" if is_cse else article['source']['name']
    title = article['title']
    # NewsAPI items carry the link under 'url'; CSE items under 'link'
    url = article.get('link') if is_cse else article.get('url')
    description = article.get('snippet', '') if is_cse else article.get('description', '')
    content = article.get('content', '')
    # Score each field with the sentiment pipeline
    title_sentiment = analyze_sentiment(title)
    description_sentiment = analyze_sentiment(description)
    content_sentiment = analyze_sentiment(content)
    if not content:
        content_sentiment = 0
    aggregated_sentiment = aggregate_sentiment(title_sentiment, description_sentiment, content_sentiment)
    # Convert the aggregated score to a descriptive label
    aggregated_sentiment_word = sentiment_to_words(aggregated_sentiment)
    print(f"{index}. {title} (Source: {source})")
    print(f"   URL: {url}")
    print(f"   Description: {description}")
    print(f"   Aggregated Sentiment: {aggregated_sentiment} ({aggregated_sentiment_word})\n")
    return {
        'title': title,
        'url': url,
        'source': source,
        'description': description,
        'content': content,
        'title_sentiment': title_sentiment,
        'description_sentiment': description_sentiment,
        'content_sentiment': content_sentiment,
        'aggregated_sentiment': aggregated_sentiment,
        'aggregated_sentiment_word': aggregated_sentiment_word
    }
def analyze_sentiment(text):
"""Analyze sentiment of the given text using BERT."""
if text:
result = sentiment_analyzer(text)[0]
sentiment_label = result['label']
sentiment_score = result['score']
# Convert sentiment labels to a polarity scale
if sentiment_label == 'POSITIVE':
return sentiment_score # Positive score
elif sentiment_label == 'NEGATIVE':
return -sentiment_score # Negative score
else:
return 0 # Neutral
return 0 # Neutral if text is empty
def aggregate_sentiment(title_sentiment, description_sentiment, content_sentiment):
"""Calculate the average sentiment from title, description, and content."""
count = 3
if content_sentiment == 0:
count = 2
total_sentiment = title_sentiment + description_sentiment + content_sentiment
return total_sentiment / count
def save_articles_to_csv(articles, person_name):
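    """Write the collected article dicts to output/<person_name>_articles.csv."""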
    os.makedirs('output', exist_ok=True)
    # One CSV per person, named after them
    file_path = f'output/{person_name.replace(" ", "_")}_articles.csv'
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=[
            'title', 'url', 'source', 'description', 'content',
            'title_sentiment', 'description_sentiment', 'content_sentiment',
            'aggregated_sentiment', 'aggregated_sentiment_word'
        ])
        writer.writeheader()
        writer.writerows(articles)
    print(f"Articles saved to {file_path}")
def sentiment_to_words(sentiment):
"""Convert sentiment polarity to descriptive words."""
if sentiment > 0.5:
return "Very Positive"
elif sentiment > 0:
return "Positive"
elif sentiment == 0:
return "Neutral"
elif sentiment > -0.5:
return "Negative"
else:
return "Very Negative"
def check_name_in_csv(person_name, csv_filename='output/red_notices.csv'):
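    """Check whether `person_name` appears in the scraped red-notices CSV.
    The CSV is deleted after a miss so stale scrape data is not reused."""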
    try:
        with open(csv_filename, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                # The scraped 'name' field can span several lines; normalise it
                csv_name = " ".join(row['name'].splitlines()).strip().lower()
                if csv_name == person_name.strip().lower():
                    print(f"{person_name} is found in the Red Notices.")
                    return True
        print(f"{person_name} is not found in the Red Notices.")
        # Remove the scraped file (outside the `with` block, so it is closed)
        os.remove(csv_filename)
        return False
    except FileNotFoundError:
        print(f"The file {csv_filename} does not exist.")
        return False
if __name__ == "__main__":
    # Take the name from the command line (quote multi-word names) or prompt
    if len(sys.argv) > 1:
        person_name = sys.argv[1]
    else:
        person_name = input("Enter the name of the person to search for: ")
    fetch_news_about_person(person_name)
    scrape_interpol_red_notices()
    check_name_in_csv(person_name)
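
# Example run (the name below is just a placeholder):
#   python main.py "Jane Doe"
# Fetches and sentiment-scores coverage, writes output/Jane_Doe_articles.csv,
# then scrapes Interpol red notices and reports whether the name appears.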