-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathscrapper.py
122 lines (105 loc) · 3.97 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import requests
from selenium import webdriver
import os,time,base64,getpass
from bs4 import BeautifulSoup
# Directory containing this script; every path below is anchored to it.
thisPath = os.path.dirname(os.path.realpath(__file__))

# Delimiter between the fields of one line of imageFile.
separator = " "

# picklefile: session pickle path.
# imageFile: text file listing the images to process.
# downloadDirectory: root folder under which downloaded images are stored.
picklefile, imageFile, downloadDirectory = (
    template.format(thisPath)
    for template in ("{}/session.pickle", "{}/images.txt", "{}/download")
)
def getImages():
    """Read imageFile and return its lines split into fields.

    Every line has its newline character removed and is then split on the
    module-level ``separator``.

    Returns:
        A list containing one list of string fields per line of the file.
    """
    rows = []
    with open(imageFile, 'r') as handle:
        for rawLine in handle.readlines():
            withoutNewline = rawLine.replace('\n', "")
            rows.append(withoutNewline.split(separator))
    return rows
def getSimilarImagePageLink(url):
    """Open Google's search-by-image page for ``url`` and return a result link.

    The page is loaded through the global ``selenumdriver`` and parsed with
    BeautifulSoup; the link is taken from the first ``<a>`` element whose
    class is ``iu-card-header``.

    Args:
        url: Address of the image to search for.

    Returns:
        The absolute ``https://www.google.com...`` link, or None when the
        page has no such anchor or the anchor has no ``href``.
    """
    # Imported here so the block does not depend on the file's import list.
    from urllib.parse import quote

    print("getting simlar")
    # Percent-encode the image URL: an unescaped '&', '#' or '?' inside it
    # would otherwise be read as part of the search URL itself and cut the
    # image_url parameter short.
    searchUrl = 'https://images.google.com/searchbyimage?image_url={}'.format(
        quote(url, safe='')
    )
    selenumdriver.get(searchUrl)
    soup = BeautifulSoup(selenumdriver.page_source, "lxml")
    anchor = soup.find('a', attrs={'class': 'iu-card-header'})
    href = anchor.get("href") if anchor else None
    if not href:
        # No anchor, or an anchor without href: avoid building "...comNone".
        return None
    link = "https://www.google.com{}".format(href)
    print("images site link: {}".format(link))
    return link
def googleLogin():
    """Prompt for Google credentials and submit them on the sign-in page.

    The e-mail is read with ``input`` and the password with ``getpass``.
    If either is empty the function returns without touching the browser.
    Otherwise the global ``selenumdriver`` is driven through the e-mail and
    password steps of https://accounts.google.com/signin.
    """
    googleusername = input("enter google email\n")
    password = getpass.getpass(prompt="password for {}\n".format(googleusername))
    # The original tested ``googleLogin`` (this function object, which is
    # always truthy), so an empty e-mail was never rejected.
    if not (googleusername and password):
        return

    def byCss(selector):
        # "css selector" is the locator strategy string behind
        # By.CSS_SELECTOR; find_element(by, value) exists in both old and
        # current Selenium releases, whereas the find_element_by_* helpers
        # were removed in Selenium 4.3.
        return selenumdriver.find_element("css selector", selector)

    selenumdriver.get("https://accounts.google.com/signin")
    byCss("input[aria-label='Email or phone']").send_keys(googleusername)
    byCss("div#identifierNext").click()
    time.sleep(2)  # presumably waits for the password step to render
    byCss("input[type='password']").send_keys(password)
    time.sleep(0.5)
    byCss("div#passwordNext").click()
def _extensionFromMimeType(mimeType):
    """Return the file extension for a MIME type such as 'image/svg+xml; charset=utf-8'."""
    # Drop parameters after ';', keep the subtype after '/', and cut
    # structured-syntax suffixes such as '+xml'.
    subtype = mimeType.split(";")[0].split("/")[1]
    return subtype.split("+")[0].strip().lower()


def saveImg(src,name,count):
    """Store one image under ``downloadDirectory/name/name_count.<ext>``.

    Args:
        src: Either a ``data:image/...;base64,...`` URI holding the image
            inline, or a URL that is downloaded with ``requests``.
        name: Sub-directory and file-name prefix for the image.
        count: Number appended to the file name.

    Returns:
        True when a file was written, False when there was no image data.

    Raises:
        requests.RequestException: the download failed, timed out, or the
            server answered with an HTTP error status.
        KeyError: the response carries no content-type header.
        IndexError: the MIME type contains no '/'.
    """
    if "data:image" in src[:20]:
        header, _, payload = src.partition(",")
        imgdata = base64.b64decode(payload)
        extension = _extensionFromMimeType(header)
    else:
        # Without a timeout a stalled server would block the run forever;
        # 30 seconds is an arbitrary but generous limit.
        response = requests.get(src, timeout=30)
        # Do not store an HTTP error page as if it were the image.
        response.raise_for_status()
        imgdata = response.content
        extension = _extensionFromMimeType(response.headers['content-type'])
    if not imgdata:
        return False
    filename = "{}/{}/{}_{}.{}".format(downloadDirectory, name, name, count, extension)
    # Create the target folder here so the function also works when the
    # caller has not prepared it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'wb') as f:
        f.write(imgdata)
    # Report success only after the file has actually been written.
    print("saved image at ({})".format(filename))
    return True
def downloadFromSimilarImagesPage(url,name,maxNo):
    """Collect ``<img>`` elements from a results page and save up to ``maxNo``.

    The page is opened in the global ``selenumdriver`` and scrolled down in
    steps until at least ``maxNo`` images are present inside the element
    with id ``search``, or until a scroll step finds no images or no more
    than the previous step. The images are then saved with ``saveImg`` into
    ``downloadDirectory/name``.

    Args:
        url: Address of the results page to open.
        name: Sub-directory and file-name prefix for the saved images.
        maxNo: Maximum number of images to save.
    """
    selenumdriver.get(url)
    images = []
    scrollTo = 1080
    previous_imgs = 0
    while len(images) < maxNo:
        selenumdriver.execute_script("window.scrollTo(0, {})".format(scrollTo))
        time.sleep(2)  # presumably gives the page time to load more results
        soup = BeautifulSoup(selenumdriver.page_source, "lxml")
        container = soup.find("div", attrs={'id': 'search'})
        # A page without the #search container is treated as "no images"
        # instead of raising AttributeError on None.
        images = container.find_all("img") if container else []
        scrollTo += 500
        if not images or len(images) == previous_imgs:
            # Nothing found, or scrolling no longer yields new images.
            break
        previous_imgs = len(images)

    os.makedirs("{}/{}".format(downloadDirectory, name), exist_ok=True)
    count = 0
    for img in images:
        # Checked before saving with >=: the original tested count > maxNo
        # afterwards, which allowed maxNo + 1 images to be stored.
        if count >= maxNo:
            break
        src = img.get("src") or img.get("data-src")
        if not src:
            continue
        print("saving image {}".format(count))
        try:
            if saveImg(src, name, count):
                count += 1
        except Exception as error:
            # Best effort: one broken image must not stop the others, but
            # the failure is reported instead of being dropped silently.
            print("could not save image {}: {}".format(count, error))
def main():
    """Start Firefox, log in to Google and process every entry of the image list.

    Creates the global ``selenumdriver``, calls ``googleLogin`` and then, for
    each line returned by ``getImages`` that has exactly three fields and no
    ``#`` in the first one, looks up the similar-images page and downloads
    from it.

    Raises:
        ValueError: the count field of a line is not an integer.
    """
    global selenumdriver
    selenumdriver = webdriver.Firefox()
    googleLogin()
    # The original had a bare ``return`` here, which made everything below
    # unreachable, so the script only ever logged in.
    for each in getImages():
        # Skip lines with '#' in the first field and lines that do not have
        # exactly the three fields name, count and url.
        if "#" in each[0] or len(each) != 3:
            continue
        name, count, url = each
        print(each)
        similarImagePageLink = getSimilarImagePageLink(url)
        if similarImagePageLink:
            downloadFromSimilarImagesPage(url=similarImagePageLink, name=name, maxNo=int(count))
        else:
            print("cant get similar image search page")
if __name__=="__main__":
    # Script entry point: run main(), explain ValueError (main() converts the
    # count field with int()) and always shut the browser down afterwards.
    try:
        main()
    except ValueError as e:
        print(str(e))
        print("quiiting")
        print("make sure there is no more than one spaces in {} also at the end".format(imageFile))
    finally:
        #saveSession()
        # The global only exists once main() has created the driver.
        if 'selenumdriver' in globals():
            # quit() ends the whole WebDriver session and its driver process;
            # close() would only close the current browser window.
            selenumdriver.quit()