# getDataFromGitHub.py (forked from rsain/GitHub-Crawler)
# This script crawls repository information from GitHub using the GitHub REST search API (https://developer.github.com/v3/search/).
#
# Given a query, the script downloads the ZIP archive of each repository returned by the query.
# In addition, it generates a text file listing the repositories that were queried.
# For each query, GitHub returns a JSON document, which this script parses to extract repository information.
#
# The GitHub search API returns at most 100 elements per page and up to 1,000 elements per query in total.
# To collect more than 1,000 elements, the main query must be split into multiple subqueries, for instance over
# different time windows through the constant SUBQUERIES (a list of subqueries) or, as done in downProj below,
# by repeatedly lowering the upper bound of a star-count range.
#
# As configured, the constants collect C and C++ repositories with at least 50 stars.
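#
# A minimal illustration (not used below) of the time-window approach mentioned above: the query can be
# split by repository creation date so that each subquery stays under the 1,000-element cap, e.g.
#   SUBQUERIES = ["created:2018-01-01..2018-06-30", "created:2018-07-01..2018-12-31"]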
#############
# Libraries #
#############
import wget
import time
import simplejson
import pycurl
import math
import os
try:
    # Python 3
    from io import BytesIO
except ImportError:
    # Python 2
    from StringIO import StringIO as BytesIO
#############
# Constants #
#############
ACCESS_TOKEN = "" #Personal access token (optional; note that passing it as a URL parameter is deprecated by GitHub)
URL = "https://api.github.com/search/repositories?access_token=" + ACCESS_TOKEN + "&q=" #The base URL of the GitHub search API
QUERY = "user:DerZc+" #A personalized query (for instance, the repositories of a given user); not used by downProj below
SUBQUERIES = ["language:C+language:cpp+stars:>=50"] #Different subqueries if you need to collect more than 1,000 elements; not used by downProj below
PARAMETERS = "&per_page=100" #Additional parameters for the query (by default 100 items per page)
DELAY_BETWEEN_QUERYS = 10 #The time (in seconds) to wait between queries to GitHub (to avoid being rate-limited or banned)
OUTPUT_FOLDER = "/home/zhangchi/Github-C/" #Folder where ZIP files will be stored
OUTPUT_TXT_FILE = "/home/zhangchi/Github-C/" #Folder where the output text file (repositories.txt) will be written
MINIMUM_PROJECT_NUM = 10000 #The minimum number of projects to collect before a failed page query is treated as done
#############
# Functions #
#############
def getUrl(url):
    ''' Given a URL, return the response body as a string '''
    buffer = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, buffer)
    try:
        c.perform()
    except Exception as e:
        # Retry once after a delay (e.g. on a transient network error or rate limit)
        print(e)
        time.sleep(DELAY_BETWEEN_QUERYS)
        c.perform()
    c.close()
    body = buffer.getvalue().decode('utf-8')
    return body
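
# Note: GitHub has deprecated token authentication via the "access_token" query parameter (used in URL above).
# A minimal sketch of header-based authentication with pycurl, assuming a non-empty ACCESS_TOKEN, would add
#   c.setopt(pycurl.HTTPHEADER, ["Authorization: token " + ACCESS_TOKEN])
# next to the other c.setopt calls in getUrl.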
########
# MAIN #
########
def downProj():
    #Counter for the number of repositories processed
    countOfRepositories = 0
    currentMaxStars = 200000
    #Output text file which will contain the list of downloaded repositories
    f = open(OUTPUT_TXT_FILE + "repositories.txt", "a+")
    #Run queries to get information in JSON format and download the ZIP file of each repository.
    #Instead of iterating over SUBQUERIES, paginate by repeatedly lowering the upper bound of the star range.
    while currentMaxStars > 50:
        #Obtain the number of pages for the current query (by default each page contains 100 items)
        url = URL + "language:C+language:cpp+stars:50.." + str(currentMaxStars) + PARAMETERS
        print("query url: ", url)
        try:
            dataRead = simplejson.loads(getUrl(url))
        except Exception as e:
            print(e)
            time.sleep(DELAY_BETWEEN_QUERYS)
            return 1
        numberOfPages = int(math.ceil(dataRead.get('total_count') / 100.0))
        #Results are spread over different pages (the search API serves at most 10 pages of 100 items)
        for currentPage in range(1, 11):
            print("Processing page %d of %d ..." % (currentPage, numberOfPages))
            url = URL + "language:C+language:cpp+stars:50.." + str(currentMaxStars) + PARAMETERS + "&page=" + str(currentPage)
            print("current page url: ", url)
            try:
                dataRead = simplejson.loads(getUrl(url))
            except Exception as e:
                print(e)
                #Stop cleanly if enough projects have already been collected; otherwise retry from the top
                if countOfRepositories > MINIMUM_PROJECT_NUM:
                    return 0
                else:
                    time.sleep(DELAY_BETWEEN_QUERYS)
                    return 1
            #Iterate over all the repositories in the current JSON page
            for item in dataRead['items']:
                #Obtain user and repository names
                user = item['owner']['login']
                repository = item['name']
                stargazers_count = item['stargazers_count']
                #On the last reachable page, remember the star count of the last repository seen,
                #so that the next query continues below it
                if currentPage == 10:
                    currentMaxStars = int(stargazers_count)
                #Download the ZIP file of the current project
                print("Downloading repository '%s' from user '%s' ..." % (repository, user))
                url = item['clone_url']
                #Strip the trailing ".git" from the clone URL and point to the master branch archive
                fileToDownload = url[0:len(url) - 4] + "/archive/master.zip"
                fileName = item['full_name'].replace("/", "#") + ".zip"
                #Skip repositories that have already been downloaded
                if os.path.exists(OUTPUT_FOLDER + fileName):
                    countOfRepositories = countOfRepositories + 1
                    continue
                print("download url: " + fileToDownload)
                try:
                    wget.download(fileToDownload, out=OUTPUT_FOLDER + fileName)
                except:
                    continue
                #Note: some repositories use a default branch other than 'master' (e.g. redis uses 'unstable'),
                #so the master.zip download may fail for them; no fallback branch is attempted here.
                #Record the repository and update the counter
                f.write("user: " + user + "; repository: " + repository + "\n")
                countOfRepositories = countOfRepositories + 1
        #A delay between consecutive queries to avoid hitting the rate limit
        print("Sleeping %d seconds before the new query ..." % DELAY_BETWEEN_QUERYS)
        time.sleep(DELAY_BETWEEN_QUERYS)
    print("DONE! %d repositories have been processed." % countOfRepositories)
    f.close()
    return 0
#Keep restarting downProj until it reports success (returns 0)
res = 1
while res:
    res = downProj()