-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrevsr_mod.py
159 lines (124 loc) · 4.73 KB
/
revsr_mod.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import pycurl, json
from flask import Flask, url_for, json, request
from StringIO import StringIO
from bs4 import BeautifulSoup
import requests
from os import listdir
from os.path import isfile, join
import os.path
import sys
import logging
#for parallelizing
from joblib import Parallel, delayed
import multiprocessing
numfiles=0
# Retrieves the reverse-search results HTML for processing. This actually
# performs the reverse image lookup by fetching the Google results URL.
def retrieve(image_url):
    """Fetch the Google reverse-image-search results page at image_url.

    Returns the parsed "best guess" text from the results page, or "" when
    the fetch or the parse fails.
    """
    returned_code = StringIO()
    try:
        conn = pycurl.Curl()
        conn.setopt(conn.URL, str(image_url))
        # Google answers with redirects; follow them to the results page.
        conn.setopt(conn.FOLLOWLOCATION, 1)
        # Spoof a desktop browser UA, otherwise Google serves a stripped page.
        conn.setopt(conn.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11')
        conn.setopt(conn.WRITEFUNCTION, returned_code.write)
        conn.perform()
        conn.close()
    except Exception as e:
        logging.info("Failed to get info from Google: " + str(e))
        # Bug fix: previously this branch fell through and parsed whatever
        # partial buffer had accumulated; return the empty-parse result.
        return google_image_results_parser("")
    return google_image_results_parser(returned_code.getvalue())
# Parses returned code (html,js,css) and extracts the "best guess" search
# text using BeautifulSoup.
def google_image_results_parser(code):
    """Parse a Google reverse-image-search results page.

    Returns the space-joined "best guess" text (the italic anchor text on
    the results page, possibly "" when no guess is present), or "" when
    parsing fails entirely.
    """
    # Bug fix: numfiles is a module-level counter; without this declaration
    # the `numfiles += 1` below raised UnboundLocalError on every call.
    global numfiles
    try:
        soup = BeautifulSoup(code)
        # Collected result fields; only 'original' is currently used.
        whole_array = {
            'original': [],
            'description': [],
            'title': [],
            'result_qty': []}
        # The "best guess" search text is rendered as italic anchors.
        for orig in soup.findAll('a', attrs={'style': 'font-style:italic'}):
            whole_array['original'].append(orig.get_text())
        gtext = ' '.join(whole_array['original'])
        logging.info(" Google text : " + str(gtext))
    except Exception as e:
        logging.info("Failed to parse google response: " + str(e))
        return ""
    # NOTE(review): counts successful parses, but when run inside joblib
    # worker processes this increment will not reach the parent -- confirm.
    numfiles += 1
    return str(gtext)
# Links for all the search results
#for li in soup.findAll('li', attrs={'class':'g'}):
# sLink = li.find('a')
# whole_array['links'].append(sLink['href'])
# Search Result Description
#for desc in soup.findAll('span', attrs={'class':'st'}):
# whole_array['description'].append(desc.get_text())
# Search Result Title
#for title in soup.findAll('h3', attrs={'class':'r'}):
# whole_array['title'].append(title.get_text())
# Number of results
#for result_qty in soup.findAll('div', attrs={'id':'resultStats'}):
# whole_array['result_qty'].append(result_qty.get_text())
#return 1 #build_json_return(whole_array)
#def build_json_return(whole_array):
#return json.dumps(whole_array)
#print "Google text:"
#gtext = ' '.join(whole_array['original'])
#print gtext
#with open("Output.txt", "a") as myfile:
# myfile.write("\n"+gtext)
#print "\n"
#print "description:"
#desc = ' '.join(whole_array['description'])
#print desc
#print "\n"
#print "Title:"
#title = ' '.join(whole_array['title'])
#print title
#print "\n"
#print "results:"
#print ' '.join(whole_array['result_qty'])
#print to file
#text_file = open("Output.txt", "w")
#text_file.write(title)
#text_file.close()
#if __name__ == '__main__':
# app.debug = True
#app.run(host='0.0.0.0')
#retrieve("tajmahal.org.uk/gifs/taj-mahal.jpeg")
#retrieve("103.232.241.5/taj.jpeg")
def processInput(path, num_files):
    """Upload the image at `path` to Google search-by-image and return the
    parsed "best guess" text, or "" on failure.

    num_files is currently unused; kept for interface compatibility with
    the Parallel call in start_search.
    """
    searchUrl = 'http://www.google.com/searchbyimage/upload'
    try:
        # Bug fix: the file handle was previously never closed; `with`
        # guarantees it is released even if the POST raises.
        with open(path, 'rb') as image_file:
            multipart = {'encoded_image': (path, image_file), 'image_content': ''}
            # Don't follow the redirect: the Location header *is* the
            # results-page URL we want.
            response = requests.post(searchUrl, files=multipart, allow_redirects=False)
        fetchUrl = response.headers['Location']
    except Exception as e:
        logging.info("Failed to encode image search URL:" + str(e))
        # Bug fix: previously execution fell through to retrieve(fetchUrl)
        # with fetchUrl unbound, raising NameError; return the same ""
        # that a failed parse yields.
        return ""
    return retrieve(fetchUrl)
def start_search(files):
    """Run a reverse image search for every file path in `files` in parallel.

    Returns the list of per-file results from processInput, in the same
    order as `files`.
    """
    num_files = len(files)
    logging.info("---------- No. of files: " + str(num_files))
    # Fan the uploads out across all available cores.
    num_cores = multiprocessing.cpu_count()
    results = Parallel(n_jobs=num_cores)(
        delayed(processInput)(path, num_files) for path in files)
    # NOTE(review): numfiles is incremented inside worker processes, so with
    # process-based joblib backends the parent's global may still be 0 here
    # regardless of actual success -- confirm against the joblib backend used.
    if numfiles == 0:
        logging.info("****No result from any image search****")
    if numfiles < num_files:
        # Bug fix: previously concatenated ints with str, raising TypeError
        # whenever this branch executed.
        logging.info("Search successful for " + str(numfiles) + " out of " + str(num_files) + " images....")
    return results
#for path in onlyfiles:
# processInput(path)