-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtopbeers_prefect.py
82 lines (65 loc) · 2.5 KB
/
topbeers_prefect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from bs4 import BeautifulSoup
import requests
from prefect import task, Flow
from prefect.executors import LocalDaskExecutor
# Root of the site; relative hrefs scraped from its pages are appended to it.
base_url = 'https://beeradvocate.com'

# Listing page the flow starts from.
top_url = f'{base_url}/lists/top/'

# Browser-like User-Agent sent with every request issued by get_page.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
}
def get_page(url, timeout=30):
    """Fetch ``url`` with the module-level ``headers`` and return the response.

    Args:
        url: Absolute URL to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests would block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` for the request. The HTTP status is not
        checked here; callers receive error responses as-is.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    # The session serves exactly one request, so close it (and its connection
    # pool) on the way out. The body is already downloaded at that point
    # because the request is not streamed, so ``response.text`` stays usable.
    with requests.Session() as session:
        return session.get(url, headers=headers, timeout=timeout)
@task
def get_table_rows(url):
    """Download ``url`` and return its ``<tr>`` elements, minus the first one.

    The lookup is page-wide: every ``<tr>`` in the document is collected,
    not only those of one particular table.
    """
    html = get_page(url).text
    soup = BeautifulSoup(html, "html.parser")
    # The first row is dropped (presumably the table header -- confirm
    # against the live page); everything after it is passed on untouched.
    return soup.findAll('tr')[1:]
def _parse_beer_row(row):
    """Turn one table row into a beer dict, or return None if it is not a beer row.

    A beer row needs at least four ``<td>`` cells (rank, description,
    ratings, average rating) and at least three links in the description
    cell (beer, brewer, style). Anything else is reported as None.
    """
    cells = row.find_all('td')
    if len(cells) < 4:
        return None
    links = cells[1].find_all('a')
    if len(links) < 3:
        return None

    beer_link, brewer_link, style_link = links[:3]
    brewer_page = brewer_link.get('href')
    # The ABV, when present, is whatever follows the last " | " in the
    # description cell's text; no separator means no ABV was listed.
    _, separator, abv = cells[1].text.rpartition(" | ")

    return {
        'rank': cells[0].text,
        'name': beer_link.text,
        'page': beer_link.get('href'),
        'brewer_name': brewer_link.text,
        'brewer_page': brewer_page,
        # Second-to-last path segment of the brewer href; assumes the href
        # ends with "<id>/" -- TODO confirm against the live markup.
        'brewer_id': brewer_page.split("/")[-2],
        'style': style_link.text,
        'ratings': cells[2].text,
        'avg_rating': cells[3].text,
        'abv': abv if separator else "",
    }


@task
def get_beers(table_rows):
    """Extract one dict per beer from the ``<tr>`` elements in ``table_rows``.

    Args:
        table_rows: Iterable of BeautifulSoup ``<tr>`` tags.

    Returns:
        A list of dicts with the string-valued keys ``rank``, ``name``,
        ``page``, ``brewer_name``, ``brewer_page``, ``brewer_id``, ``style``,
        ``ratings``, ``avg_rating`` and ``abv`` (``""`` when no ABV is listed).
        Rows that lack the expected cells or links are skipped instead of
        aborting the whole task with ``IndexError``; the number skipped is
        logged as a warning.
    """
    import logging

    beers = []
    skipped = 0
    for row in table_rows:
        beer = _parse_beer_row(row)
        if beer is None:
            skipped += 1
        else:
            beers.append(beer)

    if skipped:
        # Keep the skip visible: a layout change on the site would otherwise
        # show up only as a silently shorter (or empty) chart.
        logging.getLogger(__name__).warning(
            "Skipped %d table row(s) that did not look like beer entries", skipped
        )
    return beers
@task
def get_brewer(beer):
    """Add brewer details scraped from the brewer's page to ``beer``.

    Requests ``base_url + beer['brewer_page']`` and stores two values on the
    given dict: ``brewer_state`` (text of the second link inside the first
    ``id="info_box"`` element) and ``brewer_avg_rating`` (text of the first
    ``<dd>`` inside the first ``id="stats_box"`` element).

    The dict is modified in place and also returned.
    """
    response = get_page(base_url + beer['brewer_page'])
    soup = BeautifulSoup(response.text, 'html.parser')

    info_links = soup.find_all(id="info_box")[0].find_all('a')
    beer['brewer_state'] = info_links[1].text

    stat_values = soup.findAll(id="stats_box")[0].findAll('dd')
    beer['brewer_avg_rating'] = stat_values[0].text

    return beer
@task
def print_chart(top_beers):
    """Print ``top_beers`` to stdout as a Markdown table.

    Emits a header row and a separator row, then one row per beer dict.
    Every value is expected to be a string already.
    """
    # Dict keys in the order their columns appear in the header row.
    columns = (
        'rank',
        'name',
        'brewer_name',
        'brewer_state',
        'style',
        'abv',
        'avg_rating',
        'ratings',
        'brewer_avg_rating',
    )
    print("| Rank | Beer | Brewer | State | Style | ABV | AVG Rating | Ratings | Brewer AVG |")
    print("| --- | --- | --- | --- | --- | --- | --- | --- | --- |")
    for beer in top_beers:
        print("| " + " | ".join(beer[column] for column in columns) + " |")
# Build the flow object first, attach its executor, then register the tasks.
flow = Flow("top-beers")
flow.executor = LocalDaskExecutor()

with flow:
    rows = get_table_rows(top_url)
    top_beers = get_beers(rows)
    # get_brewer handles a single beer dict, so it is mapped over the list
    # produced by get_beers rather than being called on the list itself.
    beers_n_brewers = get_brewer.map(top_beers)
    print_chart(beers_n_brewers)

# Runs the whole pipeline as soon as this module is executed.
flow.run()