-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunmasked3.py
56 lines (51 loc) · 2.24 KB
/
unmasked3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import urllib.request
import sys
import time
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Scrape the PolitiFact fact-check listing pages, write every statement text
# to NEWS.csv (one per line) and collect the same texts in the `data` frame.

pagesToGet = 1  # number of listing pages to fetch, starting at page 1
upperframe = []  # statement texts accumulated across all pages
filename = "NEWS.csv"
# NOTE(review): the header names five columns, but each row written below
# carries only the statement text. The string is kept byte-for-byte so that
# existing readers of NEWS.csv see the same first line -- confirm whether the
# extra column names are still wanted.
headers = "Statement,Link,Date, Source, Label\n"

# Open the output once for the whole run: re-opening it in "w" mode for every
# page would truncate the rows of all earlier pages, and the context manager
# closes the file even if a page fails part-way through.
with open(filename, "w", encoding='utf-8') as f:
    f.write(headers)
    for page in range(1, pagesToGet+1):
        print('processing page :', page)
        url = 'https://www.politifact.com/factchecks/list/?page='+str(page)
        print(url)
        try:
            # Bound the wait so a stalled connection cannot hang the run, and
            # treat HTTP error statuses (4xx/5xx) as a failed page.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print('ERROR FOR LINK:', url)  # the link that caused the problem
            # exception type and the line on which it surfaced
            print(type(e), 'Line:', e.__traceback__.tb_lineno)
            continue  # skip this page and move on to the next one
        # Fixed 2-second pause after each successful fetch (presumably to
        # throttle requests to the site).
        time.sleep(2)
        soup = BeautifulSoup(response.text, 'html.parser')
        frame = []  # statement texts found on this page
        links = soup.find_all('li', attrs={'class': 'o-listicle__item'})
        print(len(links))
        for item in links:
            quote = item.find("div", attrs={'class': 'm-statement__quote'})
            if quote is None:
                # find() returns None when the element is absent; skip the
                # entry instead of aborting the whole scrape.
                continue
            Statement = quote.text.strip()
            frame.append(Statement)
            # Commas inside the text are replaced by "^" so the statement
            # stays in a single comma-separated field.
            f.write(Statement.replace(",", "^")+"\n")
        upperframe.extend(frame)

data = pd.DataFrame(upperframe, columns=['Statement'])
# Preview of the first rows; the value is only displayed in an interactive
# session (e.g. a notebook) and is discarded when run as a plain script.
data.head()