# Author: Mihir Shrestha
from geopy.exc import GeocoderTimedOut
from requests import get
from time import sleep
from geopy.geocoders import Nominatim
import json
import sqlite3
from typing import Tuple, List, Dict
import feedparser
import ssl
from bs4 import BeautifulSoup
from dateutil import parser
import pymongo
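# This script collects job listings from the GitHub Jobs API and the Stack Overflow jobs
# RSS feed, geocodes each listing's location with Nominatim, and persists the results to
# SQLite, a JSON text file, and (optionally) a local MongoDB instance.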
# Main function that calls retrieve_jobs(), retrieve_stack_over_flow_jobs(), open_db(),
# create_table(), save_to_database(), dump_data(), and (optionally) write_to_mongo().
def main():
    WRITE_TO_FILE = True
    UPDATE_DATABASE = True
    DUMP_TO_MONGO = False  # Only set to True if a MongoDB server is running, or you will get a ServerSelectionTimeoutError.
    githubJobs = retrieve_jobs()  # These are from GitHub.
    stackOverFlowjobs = retrieve_stack_over_flow_jobs()  # These are from Stack Overflow.
    totalJobs = githubJobs + stackOverFlowjobs
    if UPDATE_DATABASE:
        databaseName = 'jobs.db'
        connection, cursor = open_db(databaseName)
        create_table(connection, cursor)
        save_to_database(totalJobs, connection, cursor)
        close_db(connection)
    if WRITE_TO_FILE:
        fileName = "json.txt"
        dump_data(totalJobs, fileName)
    if DUMP_TO_MONGO:
        try:
            write_to_mongo(totalJobs)
        except pymongo.errors.ServerSelectionTimeoutError:
            print("No MongoDB service found. Please start one.")
def return_geo_location(geolocator, location: str):
    found = False
    failCounter = 0
    geoLocation = None
    if not location or 'remote' in location.lower():
        return None, None
    while not found and failCounter < 10:
        try:
            if location.strip()[-2:] == "CA":
                geoLocation = geolocator.geocode(location, timeout=1, country_codes=["US"])
            else:
                geoLocation = geolocator.geocode(location, timeout=1)
            found = True
        except GeocoderTimedOut:
            failCounter += 1
            continue
    if not geoLocation:
        return None, None
    print("Just retrieved geo-information for {}.".format(location))
    return geoLocation.latitude, geoLocation.longitude
# Retrieves job listings from the GitHub Jobs API. A short sleep between requests keeps the
# program from flooding GitHub, and any non-200 response code is reported back to the user.
# UPDATE: Paging is no longer hard-coded to 5 pages; the loop continues until a page returns
# fewer than 50 jobs. A failCounter guards against an infinite loop: if one page fails 3 times,
# that page and all following pages are skipped.
def retrieve_jobs() -> List[Dict]:
    missedList = []
    jsonData = []
    endOfJobs = False
    num = 1
    failCounter = 0
    # The failCounter is needed because if GitHub is down or the page does not exist,
    # the loop would otherwise never break.
    while not endOfJobs:
        url = "https://jobs.github.com/positions.json?page=" + str(num)
        response = get(url)
        if response.status_code == 200:
            print("Received response: {} for page {}.".format(response.status_code, num))
            newJsonData = response.json()
            jsonData += newJsonData
            if len(newJsonData) < 50:
                endOfJobs = True
            if num in missedList:
                missedList.remove(num)
            failCounter = 0
        else:
            print("Received response: {} for page {}. Trying again...".format(response.status_code, num))
            if num not in missedList:
                missedList.append(num)
            failCounter += 1
            if failCounter == 3:
                break
            # Retry the same page up to 3 times, sleeping half a second between attempts
            # so we stay friendly to GitHub. After the third failure, break out of the loop.
            sleep(.5)
            continue
        sleep(.5)
        num += 1
    if len(missedList) > 0:
        print("Missed data for page {} and potentially other following pages.".format(str(missedList)[1:-1]))
    else:
        print("Successfully retrieved data from all GitHub pages.")
    return jsonData
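# Writes the job listings to a local MongoDB instance (database "database", collection
# "jobs"), reusing each listing's original id as Mongo's _id and skipping duplicates.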
def write_to_mongo(listOfJobs):
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    database = client["database"]
    collection = database['jobs']
    for job in listOfJobs:
        try:
            job["_id"] = job["id"]
            del job["id"]
            collection.insert_one(job)
        except pymongo.errors.DuplicateKeyError:
            pass
    print("Successfully dumped data to MongoDB.")
def retrieve_stack_over_flow_jobs():
    url = "https://stackoverflow.com/jobs/feed"
    # Workaround found online: feedparser cannot parse the HTTPS feed unless
    # unverified SSL contexts are allowed.
    if hasattr(ssl, '_create_unverified_context'):
        ssl._create_default_https_context = ssl._create_unverified_context
    feed = feedparser.parse(url)
    totalData = []
    totalDict = {}
    for entry in feed.entries:
        totalDict["id"] = entry['id']
        totalDict["type"] = None
        totalDict["url"] = entry["link"]
        totalDict["created_at"] = entry["published"]
        totalDict["company"] = entry["author"]
        totalDict["company_url"] = None
        totalDict["title"] = entry["title"]
        totalDict["description"] = entry["summary"]
        totalDict["how_to_apply"] = None
        totalDict["company_logo"] = None
        try:
            totalDict["location"] = entry["location"]
        except KeyError:
            totalDict["location"] = None
        totalData.append(totalDict)
        totalDict = {}
    print("Successfully retrieved {} entries from Stack Overflow.".format(len(feed.entries)))
    return totalData
# Simple function that dumps JSON data to a .txt file.
def dump_data(jobs: list, file_name: str):
    if not (type(jobs) is list or type(jobs) is dict):
        print("Illegal type of data. {} is of type {}. Please enter a list or dictionary.".format(jobs, type(jobs)))
        return None
    if type(jobs) is dict:
        jobs = [jobs]
    with open(file_name, 'w') as openFile:
        for job in jobs:
            json.dump(job, openFile)
    print("Successfully dumped JSON data to {}.".format(file_name))
def check_if_cache_exists_and_return_geo(cursor, id, location, geolocator):
    cursor.execute("SELECT * FROM jobs WHERE jobs.id = ?", (id,))
    if cursor.fetchone() is not None:
        return None, None
    cursor.execute("SELECT jobs.geo_latitude, jobs.geo_longitude FROM jobs WHERE jobs.location = ?",
                   (location,))
    cursorResult = cursor.fetchone()
    if cursorResult is not None:
        print("Just retrieved cached geo-information for {}.".format(location))
        return cursorResult
    else:
        return return_geo_location(geolocator, location)
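# Strips HTML tags from the description and how_to_apply fields with BeautifulSoup,
# returning the plain-text versions of both.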
def strip_html(desc, how_to_apply):
    if desc is not None:
        soup = BeautifulSoup(desc, features="html.parser")
        desc = soup.get_text()
    if how_to_apply is not None:
        soup = BeautifulSoup(how_to_apply, features="html.parser")
        how_to_apply = soup.get_text()
    return desc, how_to_apply
# Simple function that writes each job to its corresponding columns in the database.
# UPDATE: It also looks up geo-coordinates (via the cache or return_geo_location), strips HTML
# from the description and how_to_apply columns, and normalizes created_at to YYYY-MM-DD for SQL.
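# Each job dictionary is expected to hold exactly these 11 keys (the shape produced by
# retrieve_jobs() and retrieve_stack_over_flow_jobs()): id, type, url, created_at, company,
# company_url, location, title, description, how_to_apply, company_logo.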
def save_to_database(jobs: list, connection: sqlite3.Connection, cursor: sqlite3.Cursor):
    if not (type(jobs) is list or type(jobs) is dict):
        print("Illegal type of data. {} is of type {}. Please enter a list or dictionary.".format(jobs, type(jobs)))
        return None
    if type(jobs) is dict:
        jobs = [jobs]
    geolocator = Nominatim(user_agent="jobsRetriever")
    for job in jobs:
        if len(job) != 11:
            print("Incorrect number of arguments. Insertion failed.")
            return None
        geolocation = check_if_cache_exists_and_return_geo(cursor, job['id'], job['location'], geolocator)
        job['description'], job['how_to_apply'] = strip_html(job['description'], job['how_to_apply'])
        try:
            cursor.execute("INSERT INTO jobs VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);",
                           [job['id'],
                            job['type'],
                            job['url'],
                            parser.parse(job['created_at']).strftime("%Y-%m-%d"),
                            job['company'],
                            job['company_url'],
                            job['location'],
                            job['title'],
                            job['description'],
                            job['how_to_apply'],
                            job['company_logo'],
                            geolocation[0],
                            geolocation[1]
                            ])
            commit_db(connection)
        except sqlite3.IntegrityError:
            pass  # The job already exists in the table; skip it.
# Simple function that creates a connection with a filename given and returns a connection and cursor.
def open_db(filename: str) -> Tuple[sqlite3.Connection, sqlite3.Cursor]:
    connection = sqlite3.connect(filename)  # Connects to an existing database or creates a new one.
    cursor = connection.cursor()  # We are now ready to read/write data.
    return connection, cursor
# Simple function that closes a database.
def close_db(connection: sqlite3.Connection):
    connection.commit()
    connection.close()
# Simple function that commits changes to a database.
def commit_db(connection: sqlite3.Connection):
    connection.commit()
# Simple function that creates the initial table jobs with its respective columns.
def create_table(connection: sqlite3.Connection, cursor: sqlite3.Cursor):
    cursor.execute('''CREATE TABLE IF NOT EXISTS jobs(
        id TEXT PRIMARY KEY,
        Position_Type TEXT,
        URL TEXT NOT NULL,
        Created_at TEXT NOT NULL,
        Company TEXT NOT NULL,
        Company_URL TEXT,
        Location TEXT,
        Title TEXT NOT NULL,
        Description TEXT NOT NULL,
        How_To_Apply TEXT,
        Company_Logo TEXT,
        geo_latitude TEXT,
        geo_longitude TEXT
        );''')
    commit_db(connection)
if __name__ == "__main__":
main()