-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtop_10_week.py
222 lines (163 loc) · 6.13 KB
/
top_10_week.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import praw #reddit api library
import pandas as pd
import config
import time
import datetime
import csv
# connects to reddit praw object using the reddit application api client id and client secret
def reddit_object():
reddit = praw.Reddit(client_id = config.client_id,
client_secret = config.client_secret,
user_agent = config.user_agent,
username = config.username,
password = config.password)
return reddit
#Calls up to the top 12 weekly submissions from r/technology
def scrape_submissions(reddit):
sub_list = []
# selects the subreddit technology. This can be changed to any subreddit
subreddit = reddit.subreddit('technology')
# have it currently calling 15 submissions at a time, but can be any number between 1 and 1000
for submission in subreddit.top('week', limit=15):
# different submission attributes I choose to pull are below
# look at the praw documentation for other submission attributes
title = submission.title
pk = submission.id
direction = submission.url
sub_list.append([pk, title, direction])
df = pd.DataFrame(sub_list, columns=['id', 'title', 'direction'])
# removes reddit links in direction column
df = df[~df.direction.str.contains("reddit.com")]
return df
#writes dataframe(df) to csv file
def new_submissions(df):
top10 = 'top10weekly.csv'
# Creates new csv file
with open(top10, 'w', newline='') as file:
writer = csv.writer(file)
#creates header columns
writer.writerow(["", "id", "direction", "title"])
writer.writerow(["", "", "", ""])
# pulls full csv
df_current = pd.read_csv('top10weekly.csv', index_col=0)
# Checks for only the new rows in the df
new_submission = df[~df['id'].isin(df_current['id'])]
new_sub_list = new_submission.values.tolist()
# Appends the new submissions to the current pandas df
df_current = df_current.append(new_submission, sort=False)
# saves new version of the csv
df_current.to_csv('top10weekly.csv')
return new_sub_list, new_submission, df_current
#takes top 5 data and appends list into html file
def to_html_list(df_current):
#reads csv we just saved and stores as df
df_current1 = pd.read_csv('top10weekly.csv', index_col=0)
#create variable to store title and link for each row/article, later we will append these into html
alink1 = df_current1.iloc[1, 1]
akit1 = df_current1.iloc[1, 2]
alink2 = df_current1.iloc[2, 1]
akit2 = df_current1.iloc[2, 2]
alink3 = df_current1.iloc[3, 1]
akit3 = df_current1.iloc[3, 2]
alink4 = df_current1.iloc[4, 1]
akit4 = df_current1.iloc[4, 2]
alink5 = df_current1.iloc[5, 1]
akit5 = df_current1.iloc[5, 2]
#close csv after we stored variables
df_current1.to_csv('top10weekly.csv')
#prepare to append/wrap variables to html
kit_info = {
'link1': alink1,
'kit1': akit1,
'link2': alink2,
'kit2': akit2,
'link3': alink3,
'kit3': akit3,
'link4': alink4,
'kit4': akit4,
'link5': alink5,
'kit5': akit5,
}
#wrapper is the base html that will be unchanged and rewritten into html index
wrapper ="""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="">
<meta name="author" content="">
<title>TechLine</title>
<link rel="shortcut icon" href="favicons.ico" />
<!-- Bootstrap core CSS -->
<link href="vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="custom.css" rel="stylesheet">
</head>
<body>
<!-- Navigation -->
<nav class="navbar navbar-expand-lg navbar-dark bg-company-black static-top">
<div class="container">
<a class="navbar-brand" href="#">
<img src="techlinelogo1.png" alt="">
</a>
</div>
</nav>
<!-- Page Content -->
<div class="container">
<div class="row">
<div class="col-md-9 text-center">
<h1>Breaking Tech News</h1>
<div class="embed-responsive embed-responsive-1by1">
<iframe class="embed-responsive-item" style="border-style: none;" src="https://jesusyanez.github.io/Reddit-scraper-to-html-table/" height="1000" width="800" ></iframe>
</div>
</div>
<div class="col-md-3 text-center">
<br>
<h2>
<u>Weekly Scoop</u>
</h2>
<ol class="text-left">
<li>
<a href="{link1}" target="_blank">{kit1}</a>
</li>
<li>
<a href="{link2}" target="_blank">{kit2}</a>
</li>
<li>
<a href="{link3}" target="_blank">{kit3}</a>
</li>
<li>
<a href="{link4}" target="_blank">{kit4}</a>
</li>
<li>
<a href="{link5}" target="_blank">{kit5}</a>
</li>
</ol>
<br/>
</div>
<div class="col-lg-12 text-center">
<p class="lead">TechLine | Jesus Yanez</p>
<ul class="list-unstyled">
</ul>
</div>
</div>
</div>
<!-- Bootstrap core JavaScript -->
<script src="vendor/jquery/jquery.slim.min.js"></script>
<script src="vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
</body>
</html>"""
#creates new html text with stored variables
new_html = wrapper.format(**kit_info)
#opens/creates index file and writes it
with open("index.html", "w", encoding='utf-8') as file:
file.write(new_html)
return df_current1
def main():
reddit = reddit_object()
df = scrape_submissions(reddit)
new_sub_list, new_submission, df_current = new_submissions(df)
df_current1 = to_html_list(df_current)
print("new submissions: ", new_submission.shape)
print("Current Dataframe: ", df_current.shape)
main()