-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathSCRAPER.py
130 lines (116 loc) · 5.56 KB
/
SCRAPER.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import sys
import time
import traceback

from prompt_toolkit.keys import Keys
from selenium import webdriver

from flask_test import *
# Prime Video sign-in credentials.  They are read from the environment so real
# secrets never have to be committed to source control; the placeholders below
# are only used when the corresponding variable is not set.
user_name = os.environ.get("PRIME_VIDEO_EMAIL", 'abc@gmail.com')  # enter your email ID
password = os.environ.get("PRIME_VIDEO_PASSWORD", 'your_actual_password')  # enter your password

# Module-level URL accumulators.  The functions visible in this file do not
# read them (start() works on a local list of the same name); they are kept so
# that any external code importing these names keeps working.
see_more_urls = []
movie_urls = []
def scroll(driver, timeout, max_scrolls=1):
    """Scrape every movie linked from the listing page loaded in *driver*.

    The page is scrolled to the bottom, the ``href`` of every
    ``av-beard-title-link`` element is collected, and each linked detail page
    is then visited in turn.  For each one a ``Movie`` is built from the
    title, description and the recognised ``<dt>``/``<dd>`` attribute pairs,
    and passed to ``add_data``.

    Args:
        driver: Selenium WebDriver already positioned on a listing page.
        timeout: Seconds to wait after each scroll before measuring the page
            height again.
        max_scrolls: Upper bound on scroll-to-bottom passes.  Scrolling stops
            earlier once the page height stops growing.  The default of 1
            performs a single pass.

    Attribute extraction is best-effort: if it fails for a movie, the
    exception traceback is printed to stdout and the movie is still stored
    with whatever fields were read before the failure.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(timeout)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was appended to the page; further scrolling is useless.
            break
        last_height = new_height

    # Collect the hrefs before navigating: the loop below replaces the page
    # with driver.get(), so the link elements themselves cannot be reused.
    movie_links = [
        link.get_attribute("href")
        for link in driver.find_elements_by_class_name("av-beard-title-link")
    ]

    for movie_link in movie_links:
        driver.get(movie_link)
        time.sleep(2)  # fixed pause before reading the detail page
        title = driver.find_element_by_class_name('_1GTSsh').text
        description = driver.find_element_by_class_name('_3qsVvm').text

        # Start from empty values for every movie so an attribute missing on
        # this page cannot inherit the value scraped for a previous movie.
        fields = dict.fromkeys(
            list(_PRIMARY_FIELDS.values()) + list(_SECONDARY_FIELDS.values()), ""
        )
        try:
            sections = driver.find_elements_by_class_name("_1ONDJH")
            _read_section(sections[0], _PRIMARY_FIELDS, fields, verbose=True)
            _read_section(sections[1], _SECONDARY_FIELDS, fields)
        except Exception:
            # Best-effort: report the failure and keep the partial record.
            traceback.print_exc(file=sys.stdout)

        movie = Movie(title=title, description=description, **fields)
        add_data(movie)


# <dt> labels read from the first "_1ONDJH" section, mapped to Movie keywords.
_PRIMARY_FIELDS = {
    "Director": "director",
    "Starring": "starring",
    "Genres": "genre",
    "Subtitles": "subtitles",
    "Audio languages": "audio_languages",
}

# <dt> labels read from the second "_1ONDJH" section, mapped to Movie keywords.
_SECONDARY_FIELDS = {
    "Producers": "producer",
    "Studio": "studio",
    "Amazon Maturity Rating": "amazon_maturity_rating",
    "Supporting actors": "supporting_actors",
}


def _read_section(section, label_to_field, fields, verbose=False):
    """Copy the recognised <dt>/<dd> pairs of *section* into *fields*.

    Args:
        section: Element whose ``<dl>`` children each hold one attribute.
        label_to_field: Mapping from ``<dt>`` text to the key to set in
            *fields*.
        fields: Dict updated in place with the ``<dd>`` text of each
            recognised label.
        verbose: When true, print every label/value pair that is seen,
            recognised or not.
    """
    for entry in section.find_elements_by_tag_name("dl"):
        label = entry.find_element_by_tag_name("dt").text
        field = label_to_field.get(label)
        if field is None and not verbose:
            # Unrecognised label and nothing to print: skip the <dd> lookup.
            continue
        value = str(entry.find_element_by_tag_name("dd").text)
        if field is not None:
            fields[field] = value
        if verbose:
            print("attribute: " + str(label) + " value" + value)
def start(executable_path="C:\\Users\\Vishank\\Selenium Web Drivers\\chromedriver.exe"):
    """Sign in to Prime Video and scrape every movie genre listing.

    Opens Chrome, signs in with the module-level ``user_name`` and
    ``password``, opens the movies page, finds the links under the
    "Movies genres" section and calls ``scroll`` on each distinct link.

    Args:
        executable_path: Location of the chromedriver binary.  Defaults to
            the path this script has always used.

    The browser is closed when the function exits, including when a step
    raises.
    """
    # Use Selenium's own key constants.  The module-level ``Keys`` is imported
    # from prompt_toolkit, which describes terminal key presses and is not
    # meant to be passed to WebElement.send_keys().
    from selenium.webdriver.common.keys import Keys

    driver = webdriver.Chrome(executable_path=executable_path)
    try:
        driver.get("https://www.primevideo.com")
        driver.find_element_by_id("pv-nav-sign-in").click()
        driver.find_element_by_id("ap_email").send_keys(user_name)
        password_box = driver.find_element_by_id("ap_password")
        password_box.send_keys(password)
        password_box.send_keys(Keys.RETURN)  # submit the sign-in form
        driver.find_element_by_id("pv-nav-movies").click()

        # Single scroll to the bottom followed by a fixed pause before the
        # genre sections are looked up.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        pages = driver.find_elements_by_css_selector("._4XLGta")
        print(pages)

        # Keep the links of the section whose text starts with "Movies genres"
        # (the last such section wins if there are several).
        genre_links = []
        for page in pages:
            if page.text.startswith("Movies genres"):
                genre_links = page.find_elements_by_css_selector("._2LQvA6 a")

        # De-duplicate while preserving order, and drop anchors without an
        # href so driver.get() is never handed None.
        hrefs = (link.get_attribute("href") for link in genre_links)
        unique_urls = list(dict.fromkeys(href for href in hrefs if href))

        for url in unique_urls:
            time.sleep(2)
            driver.get(url)
            time.sleep(2)
            scroll(driver, 2)
    finally:
        # Always release the browser, even when a lookup or navigation fails.
        driver.quit()
def test_func():
    """Build a hard-coded sample ``Movie`` and hand it to ``add_data``.

    Only the title, description, director, starring, genre, subtitles and
    audio-language fields are supplied; every other ``Movie`` argument is
    left at whatever default the model defines.
    """
    sample_fields = {
        "title": "English Medium",
        "description": "Best Movie",
        "director": "Aaad,Bdda,sva,",
        "starring": "Kvnasun asindd, asiudn as",
        "genre": "Thriller",
        "subtitles": "English, Hindi",
        "audio_languages": "Hindi",
    }
    add_data(Movie(**sample_fields))
# Run the full scrape only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    start()