forked from groovecoder/examine-chrome-extensions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawl_amo.py
70 lines (58 loc) · 2.38 KB
/
crawl_amo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import json
import os
import requests
import sys
# Base directory for crawl output: an 'extensions' folder under the current
# working directory, resolved to an absolute path.
root = os.path.abspath(os.path.join(os.curdir, 'extensions'))

# Origin of the addons.mozilla.org (AMO) service that is queried.
amo_server = 'https://addons.mozilla.org'

# Maps each output field name to the sequence of dict keys / list indexes that
# is followed, step by step, into an add-on record to reach the value.
firefox_detail_fields = {
    'ID': ['id'],
    # Only the US English name is taken.
    'Name': ['name', 'en-US'],
    'Users': ['average_daily_users'],
    'Rating': ['ratings', 'average'],
    'Num Ratings': ['ratings', 'count'],
    'Developer': ['authors', 0, 'username'],
    # Extension product page.
    'Product Page': ['url'],
    'File': ['current_version', 'files', 0, 'url'],
}

# Raise Python's recursion limit far above the default so that deeply nested
# calls during the crawl do not raise RecursionError.
sys.setrecursionlimit(200000)
def _extract_detail(addon, path):
    """Follow ``path`` (dict keys / list indexes) into ``addon``.

    Returns the value found at the end of the path, or ``''`` when any step
    is missing or an intermediate value cannot be indexed.
    """
    value = addon
    try:
        for key in path:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        # Missing key, list too short, or a None/scalar in the middle of the path.
        return ''
    return value


def fetch(sortOrder, url=None):
    """Crawl AMO search results and write one details JSON file per add-on.

    Starting from ``url`` (or, when it is not given, the first page of the
    extension search sorted by ``sortOrder``), every result page is fetched in
    turn by following the response's ``next`` link until it is empty.  For each
    add-on whose first file is a WebExtension, the fields listed in
    ``firefox_detail_fields`` are written to
    ``<root>/firefox-details/<addon id>.json``.  Add-ons whose details file
    already exists are skipped, as are add-ons without a current version or
    without any files.

    Pages are walked with a loop rather than by recursion, so the number of
    pages is not bounded by the interpreter's recursion limit.

    Args:
        sortOrder: Value for the search API's ``sort`` query parameter.
        url: Optional full URL of the page to start from.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    # Leave the sort parameter, it seems to prevent duplicate results from being returned.
    url = url or (amo_server + '/api/v4/addons/search/?type=extension&page_size=50&app=firefox&sort=' + sortOrder)

    # Make sure the output directory exists so the first write cannot fail.
    details_dir = os.path.join(root, 'firefox-details')
    os.makedirs(details_dir, exist_ok=True)

    while url:
        print('Fetching: {}'.format(url))
        response = requests.get(url)
        response.raise_for_status()
        page = response.json()

        for addon in page['results']:
            current = addon.get('current_version') or {}
            files = current.get('files') or []
            if not files:
                # Nothing to inspect for this add-on; don't abort the whole crawl.
                continue

            # An extension can have multiple files, but it always seems
            # to be a file per OS. Let's just use the first file
            # so, later, we don't overcount the APIs used in an extension.
            if not files[0]['is_webextension']:
                continue

            addon_id = str(addon['id'])
            json_file = os.path.join(details_dir, addon_id + '.json')
            if os.path.exists(json_file):
                print('Details %s already exists, skipping' % addon_id)
                continue

            # Store off some details.
            details = {
                field: _extract_detail(addon, path)
                for field, path in firefox_detail_fields.items()
            }
            # Context manager guarantees the file is flushed and closed.
            with open(json_file, 'w') as out:
                json.dump(details, out)
            print('Got details for', addon_id)

        # An empty/None 'next' link marks the last page.
        url = page['next']
if __name__ == '__main__':
    # AMO has an arbitrary limit of 25,000 results per search query, so the
    # crawl is run once per sort order: the top 25K by users and the top 25K
    # by last update date. Together these cover the most popular extensions
    # as well as the most recently updated ones.
    for sort_order in ('users', 'updated'):
        fetch(sort_order)