"""
source ~/desktop/project/pyscrapeEnv/bin/activate
sudo xcode-select --switch /Applications/Xcode.app
/Applications/Xcode.app
python3 app.py
"""
from bs4 import BeautifulSoup
import requests, lxml, os, json
from parsel import Selector
# Random user-agent selection in requestHandler()
import random
# selenium + selenium-stealth to reduce bot detection
from selenium import webdriver
from selenium_stealth import stealth
import time
# Proxy rotation
from fp.fp import FreeProxy
# Excel parsing (venue spreadsheets)
import pandas
# Regex filtering of venue names
import re
# Selenium helpers: driver management, waits, dropdowns
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
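# Third-party dependencies (assumed pip package names; not pinned in the repo):
#   pip install beautifulsoup4 requests lxml parsel selenium selenium-stealth \
#       free-proxy pandas webdriver-manager openpyxl
# openpyxl is what pandas.read_excel() needs for the .xlsx files used below.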
def scrape_one_google_scholar_page():
    # https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }
    # https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls
    params = {
        'q': 'microservices',  # search query
        'hl': 'en'             # language of the search
    }
    html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params).text
    soup = BeautifulSoup(html, 'lxml')
    # JSON data will be collected here
    data = []
    # Container where all needed data is located
    for result in soup.select('.gs_r.gs_or.gs_scl'):
        title = result.select_one('.gs_rt').text
        title_link = result.select_one('.gs_rt a')['href']
        publication_info = result.select_one('.gs_a').text
        snippet = result.select_one('.gs_rs').text
        cited_by = result.select_one('#gs_res_ccl_mid .gs_nph+ a')['href']
        related_articles_URL = result.select_one('a:nth-child(4)')['href']
        all_article_versions_URL = result.select_one('.gs_nph:nth-child(5)')['href']
        all_article_versions_txt = result.select_one('.gs_nph:nth-child(5)').text
        # Not every result has a PDF link
        try:
            pdf_link = result.select_one('.gs_or_ggsm a:nth-child(1)')['href']
        except (AttributeError, TypeError):
            pdf_link = None
        data.append({
            'title': title,
            'title_link': title_link,
            'publication_info': publication_info,
            'snippet': snippet,
            'cited_by': f'https://scholar.google.com{cited_by}',
            'related_articles_URL': f'https://scholar.google.com{related_articles_URL}',          # a:nth-child(4)
            'all_article_versions_URL': f'https://scholar.google.com{all_article_versions_URL}',  # .gs_nph:nth-child(5)
            'all_article_versions_txt': all_article_versions_txt,
            'pdf_link': pdf_link
        })
    print(data)
    print(json.dumps(data, indent=2, ensure_ascii=False))
# Part of the JSON Output:
'''
[
{
"title": "“What? I thought Samsung was Japanese”: accurate or not, perceived country of origin matters",
"title_link": "https://www.emerald.com/insight/content/doi/10.1108/02651331111167589/full/html",
"publication_info": "P Magnusson, SA Westjohn… - International Marketing …, 2011 - emerald.com",
"snippet": "Purpose–Extensive research has shown that country‐of‐origin (COO) information significantly affects product evaluations and buying behavior. Yet recently, a competing perspective has emerged suggesting that COO effects have been inflated in prior research …",
"cited_by": "https://scholar.google.com/scholar?cites=341074171610121811&as_sdt=2005&sciodt=0,5&hl=en",
"related_articles": "https://scholar.google.com/scholar?q=related:U8bh6Ca9uwQJ:scholar.google.com/&scioq=samsung&hl=en&as_sdt=0,5",
"all_article_versions": "https://scholar.google.com/scholar?cluster=341074171610121811&hl=en&as_sdt=0,5"
}
]
'''
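# Usage sketch: the function above only prints its results. To persist them, one
# could have it return `data` and dump that to disk (hypothetical file name):
#
#   results = scrape_one_google_scholar_page()
#   with open('scholar_page.json', 'w', encoding='utf-8') as f:
#       json.dump(results, f, indent=2, ensure_ascii=False)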
def google_scholar_pagination():
    # https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
    headers = {
        'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
    }
    # https://requests.readthedocs.io/en/latest/user/quickstart/#passing-parameters-in-urls
    params = {
        'q': 'samsung medical center seoul semiconductor element simulation x-ray fetch',
        'hl': 'en',  # language of the search
        'start': 0   # result offset; incremented by 10 per page ⚠
    }
    # JSON data will be collected here
    data = []
    while True:
        html = requests.get('https://scholar.google.com/scholar', headers=headers, params=params).text
        selector = Selector(text=html)
        print(f'extracting {params["start"] + 10} page...')
        # Container where all needed data is located
        for result in selector.css('.gs_r.gs_or.gs_scl'):
            title = result.css('.gs_rt').xpath('normalize-space()').get()
            title_link = result.css('.gs_rt a::attr(href)').get()
            publication_info = result.css('.gs_a').xpath('normalize-space()').get()
            snippet = result.css('.gs_rs').xpath('normalize-space()').get()
            cited_by_link = result.css('.gs_or_btn.gs_nph+ a::attr(href)').get()
            data.append({
                'page_num': params['start'] + 10,  # 10 = 1st page, 70 = 7th page
                'title': title,
                'title_link': title_link,
                'publication_info': publication_info,
                'snippet': snippet,
                'cited_by_link': f'https://scholar.google.com{cited_by_link}',
            })
        # Check if the "next" button is present
        if selector.css('.gs_ico_nav_next').get():
            params['start'] += 10
        else:
            break
    print(json.dumps(data, indent=2, ensure_ascii=False))
# google_scholar_pagination()  # example run that produced the output below; invoke from __main__ rather than at import time
# Part of the output:
'''
extracting 10 page...
extracting 20 page...
extracting 30 page...
extracting 40 page...
extracting 50 page...
extracting 60 page...
extracting 70 page...
extracting 80 page...
extracting 90 page...
[
{
"page_num": 10,
"title": "Comparative analysis of root canal filling debris and smear layer removal efficacy using various root canal activation systems during endodontic retreatment",
"title_link": "https://www.mdpi.com/891414",
"publication_info": "SY Park, MK Kang, HW Choi, WJ Shon - Medicina, 2020 - mdpi.com",
"snippet": "… According to a recent study, the GentleWave System was effective in retrieving separated … Energy dispersive X-ray spectroscopy (EDX) may be used for the microchemical analysis of …",
"cited_by_link": "https://scholar.google.com/scholar?cites=5221326408196954356&as_sdt=2005&sciodt=0,5&hl=en"
},
{
"page_num": 90,
"title": "Αυτόματη δημιουργία ερωτήσεων/ασκήσεων για εκπαιδευτικό σύστημα διδασκαλίας τεχνητής νοημοσύνης",
"title_link": "http://nemertes.lis.upatras.gr/jspui/handle/10889/9424",
"publication_info": "Ν Νταλιακούρας - 2016 - nemertes.lis.upatras.gr",
"snippet": "Στόχος της διπλωματικής είναι ο σχεδιασμός ,η ανάπτυξη και υλοποίηση ενός συστήματος παραγωγής ερωτήσεων/ασκήσεων από κείμενα φυσικής γλώσσας. Κύριος στόχος των …",
"cited_by_link": "https://scholar.google.com/scholar?q=related:1ovrKI-7xtUJ:scholar.google.com/&scioq=samsung+medical+center+seoul+semiconductor+element+simulation+x-ray+fetch&hl=en&as_sdt=0,5",
}
]
'''
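# Note: Google Scholar rate-limits and CAPTCHAs repeated requests aggressively.
# A hedged mitigation sketch (random and time are already imported): pause
# between page fetches inside the while-loop, before the requests.get() call:
#
#   time.sleep(random.uniform(2.0, 5.0))  # randomized delay between pages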
def requestHandler(URL, params):
    user_agent_list = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    ]
    # Pick a random user agent
    user_agent_chosen = random.randint(0, len(user_agent_list) - 1)
    print("Agent " + str(user_agent_chosen) + " is chosen")
    # Plain-requests alternative, kept for reference:
    # headers = {'User-Agent': user_agent_list[user_agent_chosen]}
    # return requests.get(URL, headers=headers, params=params)
    # selenium_stealth setup
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    # options.add_argument("--headless")
    # Get the newest free proxy that responds within 4 seconds
    proxy = FreeProxy(timeout=4).get()
    options.add_argument(f'--proxy-server={proxy}')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    stealth(
        driver,
        user_agent=user_agent_list[user_agent_chosen],
        languages=["en-US", "en"],
        vendor="Google Inc.",
        platform="Win32",
        webgl_vendor="Intel Inc.",
        renderer="Intel Iris OpenGL Engine",
        fix_hairline=True,
        run_on_insecure_origins=False,
    )
    driver.get(URL)
    return driver
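# Minimal usage sketch (environmental assumptions: Chrome is installed, and the
# free proxy returned by FreeProxy is alive, which is not guaranteed):
#
#   driver = requestHandler("https://bot.sannysoft.com/", {})
#   print(driver.title)
#   driver.quit()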
params = {
    "q": "microservice",  # search query
    "hl": "en"            # language of the search
}
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
    "accept-language": "en-US,en",
    "referer": f"https://scholar.google.com/scholar?hl={params['hl']}&q={params['q']}"
}
def parsel_get_cite_ids():
    print("parsel_get_cite_ids()")
    # Selenium's driver.get() takes no separate params dict, so inline the query
    URL = f"https://scholar.google.com/scholar?q={params['q']}&hl={params['hl']}"
    driver = requestHandler(URL, params)
    page_source = driver.page_source
    driver.quit()
    # print(page_source)  # debug
    soup = Selector(text=page_source)
    # Returns a list of publication IDs, e.g. U8bh6Ca9uwQJ
    return soup.css(".gs_r.gs_or.gs_scl::attr(data-cid)").getall()
def parsel_scrape_cite_results():
    print("parsel_scrape_cite_results()")
    citations = []
    for cite_id in parsel_get_cite_ids():
        print(cite_id)
        html = requests.get(f"https://scholar.google.com/scholar?output=cite&q=info:{cite_id}:scholar.google.com", headers=headers)
        selector = Selector(text=html.text)
        # The citation-table layout may change; guard on the #gs_citt container
        if selector.css('#gs_citt').get():
            for result in selector.css("tr"):
                institution = result.xpath("th/text()").get()
                citation = result.xpath("td div/text()").get()
                citations.append({"institution": institution, "citations": citation})
    return citations
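# Shape of the rows collected above (values are illustrative placeholders, not
# recorded output; Google Scholar's cite dialog lists styles such as MLA/APA):
#
#   [{"institution": "MLA", "citations": "<MLA-formatted reference>"},
#    {"institution": "APA", "citations": "<APA-formatted reference>"}]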
def findVenueRank(conf, mimicReal):
    # conf=True ranks conferences from venue.xlsx; conf=False ranks journals
    # from venue_scraped.xlsx, so journal ranks are appended to conference results
    if conf:
        df = pandas.read_excel('venue.xlsx')
        thesisType = "conf_"
    else:
        df = pandas.read_excel('venue_scraped.xlsx')
        thesisType = "journal_"
    # Reset the extra index column created when writing to the file twice
    df.reset_index(drop=True, inplace=True)
    # CSS selectors for the CORE portal search form and result table
    type_select = '#searchform > select:nth-child(2)'
    year_select = '#searchform > select:nth-child(3)'
    search_select = '#searchform > input[type=text]:nth-child(1)'
    result_title_select = '#search > table > tbody > tr.evenrow > td:nth-child(1)'
    result_rank_select = '#search > table > tbody > tr.evenrow > td:nth-child(4)'
    search_button = '#searchform > input[type=submit]:nth-child(7)'
    if conf:
        URL = 'http://portal.core.edu.au/conf-ranks/'
    else:
        URL = 'http://portal.core.edu.au/jnl-ranks/'
    if mimicReal:
        browser = webdriver.Chrome(options=webdriver.ChromeOptions())
        browser.get(URL)
    else:
        browser = requestHandler(URL, 0)
    Select(browser.find_element(By.CSS_SELECTOR, type_select)).select_by_visible_text('Title')
    Select(browser.find_element(By.CSS_SELECTOR, year_select)).select_by_visible_text('All')
    for i in range(len(df["venue"])):
        print("iteration at index " + str(i))
        print("data at index " + str(i) + " = " + str(df.at[i, "venue"]))
        if str(df.at[i, "venue"]) == "nan":
            print("the str value at index " + str(i) + " was 'nan'")
            continue
        browser.find_element(By.CSS_SELECTOR, search_select).send_keys(df.at[i, "venue"])
        browser.find_element(By.CSS_SELECTOR, search_button).click()
        # WebDriverWait(browser, 2).until(EC.visibility_of_element_located((By.CSS_SELECTOR, result_title_select)))
        name_column = thesisType + "result_name"
        rank_column = thesisType + "result_rank"
        if browser.find_elements(By.CSS_SELECTOR, result_title_select) and browser.find_elements(By.CSS_SELECTOR, result_rank_select):
            num_result = browser.find_elements(By.XPATH, "//*[contains(text(),'Showing results ')]")
            if len(num_result) > 0:
                num_result = num_result[0].text
                print("num_result = " + str(num_result))
            n = 2  # first data row of the result table
            print("n = " + str(n))
            nth_result_title_select = "#search > table > tbody > tr:nth-child(" + str(n) + ") > td:nth-child(1)"
            # The rank column differs between the conference and journal portals
            if conf:
                nth_result_rank_select = "#search > table > tbody > tr:nth-child(" + str(n) + ") > td:nth-child(4)"
            else:
                nth_result_rank_select = "#search > table > tbody > tr:nth-child(" + str(n) + ") > td:nth-child(3)"
            title = browser.find_elements(By.CSS_SELECTOR, nth_result_title_select)
            if len(title) > 0:
                df.at[i, name_column] = title[0].text
                print(title[0].text)
            rank = browser.find_elements(By.CSS_SELECTOR, nth_result_rank_select)
            if len(rank) > 0:
                df.at[i, rank_column] = rank[0].text
                print(rank[0].text + "\n")
        browser.find_element(By.CSS_SELECTOR, search_select).clear()
    # Save once after the loop (previously a commented-out per-iteration write)
    df.to_excel('venue_scraped.xlsx')
""" password_selector = "#password > div.aCsJod.oJeWuf > div > div.Xb9hP > input"
WebDriverWait(browser, 10).until(
EC.visibility_of_element_located((By.CSS_SELECTOR, password_selector)))
browser.find_element(
By.CSS_SELECTOR, password_selector).send_keys(password)
browser.find_element(
By.CSS_SELECTOR, '#passwordNext > div > button > span').click() """
def preFiltering():
    # Strip standalone year tokens from venue names so CORE-portal searches match
    df = pandas.read_excel('venue.xlsx')
    words = ['2022', '2021', '2020', '2019', '2018', '2017', '2016', '2015']
    for i in range(len(df["venue"])):
        for w in words:
            # '\b' is a word boundary; also collapse the surrounding whitespace
            df.at[i, "venue"] = re.sub(r'\s*\b%s\b\s*' % w, ' ', str(df.at[i, "venue"])).strip()
        if i % 100 == 0:
            print("iteration = " + str(i))
            print(df['venue'])
    # Save once after all rows are cleaned (writing per-row is needlessly slow)
    df.to_excel('venue.xlsx')
    print(df['venue'])
    return True
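# Quick illustration of the year-stripping regex (pure Python, no spreadsheet):
#
#   >>> re.sub(r'\s*\b2020\b\s*', ' ', 'Intl. Conference 2020 on Software').strip()
#   'Intl. Conference on Software'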
if __name__ == "__main__":
""" print("function 1")
scrape_one_google_scholar_page()
print("function 2")
google_scholar_pagination()
print(parsel_get_cite_ids())
print(parsel_scrape_cite_results()) """
preFiltering()
mimicReal = True
findVenueRank(True,mimicReal)
mimicReal = True
findVenueRank(False,mimicReal)