android_app_fetcher.py (forked from alexandru/AndroidMarketCrawler)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
import urllib
import simplejson as json
import sys
import urllib2
# using PyQuery for querying retrieved HTML content using CSS3
# selectors (awesome!)
from pyquery import pyquery as pq
from datetime import datetime

class AndroidAppFetcher(object):
    '''
    Fetcher of a single app.
    '''
    urllib = urllib2

    def __init__(self, url, lang='en'):
        '''
        `lang` specifies the result language of the crawler.
        '''
        self.url = url
        self.browser = self.urllib.build_opener()
        # the hlSession2 cookie controls the language in which the
        # store serves its pages
        self.browser.addheaders.append(('Cookie', 'hlSession2=%s' % lang))
        self.app_info = None
        self.all_links = []

    def fetch_content(self):
        """
        Fetches the content of a URL, extracts app links from it and
        pushes them down the queue. Then parses the content to
        determine if it is an app and, if it is, pushes the parsed
        result into the `results` queue for later processing.

        This logic gets executed inside green threads. You shouldn't
        spawn new green threads here, as this is not the parent and
        trouble may arise.
        """
        try:
            resp = self.browser.open(self.url)
        except urllib2.HTTPError as ex:
            # silently ignore missing pages; the script should not
            # block here
            if ex.code == 404:
                return
            # anything else is a slight problem: it shouldn't happen,
            # but it does sometimes, so re-raise to keep track of how
            # often it occurs
            raise

        content = resp.read()
        self.doc = pq.PyQuery(content.decode('utf-8'))

        # do our best to ignore pages that are not relevant (music,
        # movies, other pages that don't have links to apps in them)
        if not self.is_page_valid():
            return

        # keep a log of the URLs processed
        sys.stderr.write(self.url + "\n")

        # fetch the links in this page via regular expressions; we are
        # interested in app links and publisher links, but not reviews
        # or login redirects
        self.all_links = [
            a.attrib['href']
            for a in self.doc('a')
            if re.search(r'/(details|developer)[?]', a.attrib.get('href', ''))
            and not re.search(r'reviewId', a.attrib.get('href', ''))
            and not re.search(r'accounts/ServiceLogin', a.attrib.get('href', ''))
        ]

        # parse app info from the fetched content, but ONLY in case
        # the URL is about an actual app
        self.app_info = self.fetch_app_info()
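
    # Typical use, as a sketch (illustrative only: the package name is
    # made up, and the Market markup this parser targets is long gone):
    #
    #   fetcher = AndroidAppFetcher('https://play.google.com/store/apps/details?id=com.example.app')
    #   fetcher.fetch_content()
    #   fetcher.all_links  # app/publisher links found on the page
    #   fetcher.app_info   # parsed dict, or None if not an app page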

    def is_page_valid(self):
        """
        This is a hackish method to determine if the visited page is
        useful at all.

        The big problem is that the type of item cannot be inferred
        from the link alone: links for audio, movies and apps all have
        the same format.

        `self.doc` is therefore an instantiated PyQuery document with
        the fetched content.

        What this buys us is that we can then ignore links from
        invalid pages (as movies will tend to link to other movies,
        not to other apps).
        """
        if self.url == "https://play.google.com/store/apps/":
            return True
        if self.url.startswith("https://play.google.com/store/apps/details?id=apps_topselling_paid"):
            return True
        if self.url.startswith("https://play.google.com/store/apps/details?id=apps_topselling_free"):
            return True
        if not re.search(r'details|developer', self.url):
            return False
        if re.search(r'reviewId', self.url):
            return False

        params = self.query_vars(self.url)
        if not params.get('id') and not params.get('pub'):
            return False

        if re.search(r'developer', self.url):
            # publisher pages are only useful when they list apps
            if not (self.doc('h1.page-banner-text').text() or '').lower().startswith('apps by'):
                return False
            return True

        if not self.doc('div.apps.details-page'):
            return False
        # reject details pages whose breadcrumbs don't lead back to
        # the apps section (music and movie pages won't)
        if not any(re.search(r'/apps', a.get('href', ''))
                   for a in self.doc('.breadcrumbs a')):
            return False
        return True
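
    # A few URLs and how they get classified (illustrative, assuming
    # the page content backs the URL up):
    #
    #   /store/apps/details?id=com.example.app    -> valid (app page)
    #   /store/apps/developer?id=Example+Dev      -> valid if it lists apps
    #   /store/music/album?id=...                 -> invalid (no details/developer)
    #   /store/apps/details?id=...&reviewId=...   -> invalid (review page)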

    def fetch_app_info(self):
        """
        At this point we are almost sure we have an app, so this
        method attempts to parse the content into a dictionary.
        PyQuery and CSS3 selectors are used heavily.
        """
        params = self.query_vars(self.url)
        if not params.get('id'):
            return None
        if not self.doc('div.apps.details-page'):
            return None

        app_info = {
            'uid': params['id'],
            'name': self.doc('h1.doc-banner-title').text(),
            'app_link': self.absolute_url('/details?id=' + params['id']),
            'dev_name': self.doc('a.doc-header-link').text(),
            'dev_link': self.absolute_url(self.doc('a.doc-header-link').attr['href']),
            'dev_web_links': list(set(
                self.query_vars(a.attrib['href'])['q']
                for a in self.doc('.doc-overview a')
                if a.text and "Visit Developer's Website" in a.text
            )),
            'dev_emails': list(set(
                a.attrib['href'][len('mailto:'):]
                for a in self.doc('.doc-overview a')
                if a.attrib.get('href', '').startswith('mailto:')
            )),
            'date_published': datetime.strptime(
                self.doc('[itemprop=datePublished]').text(),
                '%B %d, %Y').strftime('%Y-%m-%d'),
            'rating_count': int(re.sub(r'\D+', '', self.doc('[itemprop=ratingCount]').text() or '0')),
            'rating_value': self.doc('[itemprop=ratingValue]').attr['content'],
            'description_html': self.doc('#doc-original-text').html(),
            'more-from-developer': [
                self.query_vars(a.attrib['href'])['id']
                for a in self.doc('[data-analyticsid=more-from-developer] a.common-snippet-title')
            ],
            'users_also_installed': [
                self.query_vars(a.attrib['href'])['id']
                for a in self.doc('[data-analyticsid=users-also-installed] a.common-snippet-title')
            ],
            'users_also_viewed': [
                self.query_vars(a.attrib['href'])['id']
                for a in self.doc('[data-analyticsid=related] a.common-snippet-title')
            ],
            'icon_link': self.doc('.doc-banner-icon img').attr('src'),
            'screenshot_links': [
                a.get('src')
                for a in self.doc('.screenshot-carousel-content-container img')
            ],
            'banner_link': self.doc('.doc-banner-image-container img').attr('src'),
        }

        # a buy-button price containing digits means the app is paid
        match = re.findall(r'.*[\d\.]+', self.doc('.buy-button-price').text())
        if match:
            app_info['is_free'] = False
            app_info['price'] = match[0]
        else:
            app_info['is_free'] = True
            app_info['price'] = 0

        match = [a.text for a in self.doc('.doc-metadata-list dd a')
                 if 'category' in a.attrib.get('href', '')]
        if match:
            app_info['category'] = match[0]

        match = [re.search(r'/store/apps/category/(\w+)\?', a.get('href', ''))
                 for a in self.doc('dd a')]
        if match and match[-1]:
            app_info['category_tag'] = match[-1].groups()[0]

        # the installs count is shown as a range, e.g. "100,000 - 500,000"
        match = re.findall(r'([\d,]+)\s*-\s*([\d,]+)', self.doc('[itemprop=numDownloads]').text() or '')
        if match:
            imin, imax = [re.sub(r'\D+', '', m) for m in match[0]]
            app_info['installs_min'] = int(imin)
            app_info['installs_max'] = int(imax)

        return app_info
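
    # For orientation, a successful parse yields a dict along these
    # lines (all values invented for illustration):
    #
    #   {'uid': 'com.example.app', 'name': 'Example App',
    #    'dev_name': 'Example Dev', 'is_free': True, 'price': 0,
    #    'rating_count': 1234, 'category': 'Tools',
    #    'installs_min': 100000, 'installs_max': 500000, ...}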

    def get_id(self, url):
        """
        Extracts the `id` param from a Marketplace URL.
        """
        params = self.query_vars(url)
        return params.get('id')

    def query_vars(self, url):
        """
        Parses the query part of a URL into a dict. It was faster to
        implement this myself than to find something already
        available.
        """
        v = {}
        match = re.findall(r'[^?]+[?](.*)$', url)
        if match:
            query = match[0]
            for part in query.split('&'):
                keyval = [urllib.unquote_plus(i) for i in part.split('=', 1)]
                key, val = keyval if len(keyval) == 2 else (keyval[0], '')
                v[key] = val
        return v
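
    # What `query_vars` yields, by way of example (made-up URL):
    #
    #   query_vars('/details?id=com.example.app&feature=more_from_developer')
    #   => {'id': 'com.example.app', 'feature': 'more_from_developer'}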

    def absolute_url(self, url):
        """
        Converts a relative URL to an absolute Marketplace URL.
        """
        if url and url.startswith('/'):
            return "https://play.google.com/store/apps" + url
        return url or ''
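
    # e.g. absolute_url('/details?id=com.example.app') returns
    #   'https://play.google.com/store/apps/details?id=com.example.app'
    # while already-absolute URLs pass through unchanged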

if __name__ == '__main__':
    if len(sys.argv) <= 1:
        sys.stderr.write("\nERROR: target package name is missing!\n\n")
        sys.exit(1)

    url = "https://play.google.com/store/apps/details?id=%s" % sys.argv[1]
    fetcher = AndroidAppFetcher(url)
    fetcher.fetch_content()
    if fetcher.app_info:
        print json.dumps(fetcher.app_info, indent='  ')
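
# Example invocation (the package name is illustrative):
#
#   $ python android_app_fetcher.py com.example.app
#
# On success the parsed app info is printed to stdout as JSON, while
# the processed URL is logged to stderr.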