-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper discursos.py
44 lines (35 loc) · 1.2 KB
/
scraper discursos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- coding: utf-8 -*-
import urllib
import lxml.html
import csv
import unicodedata
# Scrape one listing page of speeches from the Planalto website and dump the
# extracted dates, titles and links into 'discurso1.csv'.
# NOTE: this script targets Python 2 (urllib.urlopen, binary-mode csv file).

# Download and parse the listing page; the connection is closed even when
# reading or parsing fails.
connection = urllib.urlopen('http://www2.planalto.gov.br/acompanhe-o-planalto/discursos/discursos-da-presidenta?b_start:int=900')
try:
    dom = lxml.html.fromstring(connection.read())
finally:
    connection.close()

# Text nodes of every <span class="summary-view-icon"> element.
data = list(dom.xpath('//span[@class="summary-view-icon"]/text()'))

# Titles: text of every <a class="summary url">, reduced to plain ASCII
# (accents are decomposed by NFKD and the combining marks then dropped).
# lxml on Python 2 may return byte strings for ASCII-only text, which
# unicodedata.normalize() rejects, so each value is coerced to unicode first.
titulo = [
    unicodedata.normalize('NFKD', unicode(d)).encode('ascii', 'ignore')
    for d in dom.xpath('//a[@class="summary url"]/text()')
]

# Links: href of every <a> nested inside an <h2 class="tileHeadline">.
link = list(dom.xpath('//h2[@class="tileHeadline"]//a/@href'))

# Keep every sixth text node, starting with the second one.
# NOTE(review): presumably each entry yields six such text nodes and the
# second one holds the date -- confirm against the page markup.
data = data[1::6]

# Same values with surrounding whitespace removed.
nova = [item.strip() for item in data]

# Quick sanity check: the three counts are expected to match.
print(len(titulo))
print(len(link))
print(len(nova))

# Write three rows: all dates, all titles, all links.
# The stripped values are written (previously the unstripped list was written
# and the stripped one was discarded).
with open('discurso1.csv', 'wb') as discurso_csv:
    discursowriter = csv.writer(discurso_csv, delimiter=' ',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
    discursowriter.writerow(nova)
    discursowriter.writerow(titulo)
    discursowriter.writerow(link)