From 4781af9600c98b87df7e7c21140ad49afeec2273 Mon Sep 17 00:00:00 2001 From: Laura Weber Date: Mon, 13 Jul 2020 15:51:30 -0700 Subject: [PATCH 1/2] Can include additional headers in GitHub queries. --- scraper/github/queryManager.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/scraper/github/queryManager.py b/scraper/github/queryManager.py index 1efd76e..f06799f 100644 --- a/scraper/github/queryManager.py +++ b/scraper/github/queryManager.py @@ -185,6 +185,7 @@ def queryGitHub( rest=False, requestCount=0, pageNum=0, + headers={}, ): """Submit a GitHub query. @@ -217,6 +218,8 @@ def queryGitHub( requestCount (Optional[int]): Counter for repeated requests. pageNum (Optional[int]): Counter for pagination. For user readable log messages only, does not affect data. + headers (Optional[Dict]): Additional headers. + Defaults to empty. Returns: Dict: A JSON style dictionary. @@ -232,7 +235,11 @@ def queryGitHub( (verbosity >= 0), "Sending %s query..." % ("REST" if rest else "GraphQL") ) response = self._submitQuery( - gitquery, gitvars=gitvars, verbose=(verbosity > 0), rest=rest + gitquery, + gitvars=gitvars, + verbose=(verbosity > 0), + rest=rest, + headers=headers, ) _vPrint((verbosity >= 0), "Checking response...") _vPrint((verbosity >= 0), "HTTP/1.1 " + response["headDict"]["Status"]) @@ -263,6 +270,7 @@ def queryGitHub( rest=rest, requestCount=(requestCount - 1), pageNum=pageNum, + headers=headers, ) except KeyError: # Handles error cases that don't return X-RateLimit data @@ -295,6 +303,7 @@ def queryGitHub( rest=rest, requestCount=requestCount, pageNum=pageNum, + headers=headers, ) # Check for server error responses if statusNum == 502 or statusNum == 503: @@ -323,6 +332,7 @@ def queryGitHub( rest=rest, requestCount=requestCount, pageNum=pageNum, + headers=headers, ) # Check for other error responses if statusNum >= 400 or statusNum == 204: @@ -366,6 +376,7 @@ def queryGitHub( rest=rest, requestCount=requestCount, pageNum=pageNum, + headers=headers, ) else: raise RuntimeError( @@ -389,6 +400,7 @@ def queryGitHub( rest=rest, requestCount=0, pageNum=pageNum, + headers=headers, ) outObj.extend(nextObj) elif not rest: @@ -415,6 +427,7 @@ def queryGitHub( rest=rest, requestCount=0, pageNum=pageNum, + headers=headers, ) newPage = nextObj for key in keysToList[0:-1]: @@ -424,7 +437,7 @@ def queryGitHub( return outObj - def _submitQuery(self, gitquery, gitvars={}, verbose=False, rest=False): + def _submitQuery(self, gitquery, gitvars={}, verbose=False, rest=False, headers={}): """Send a curl request to GitHub. Args: @@ -438,6 +451,8 @@ def _submitQuery(self, gitquery, gitvars={}, verbose=False, rest=False): suppressed. Defaults to False. rest (Optional[bool]): If True, uses the REST API instead of GraphQL. Defaults to False. + headers (Optional[Dict]): Additional headers. + Defaults to empty. Returns: { @@ -454,11 +469,13 @@ def _submitQuery(self, gitquery, gitvars={}, verbose=False, rest=False): {"query": gitquery, "variables": json.dumps(gitvars)} ) fullResponse = requests.post( - "https://api.github.com/graphql", data=gitqueryJSON, headers=authhead + "https://api.github.com/graphql", + data=gitqueryJSON, + headers={**authhead, **headers}, ) else: fullResponse = requests.get( - "https://api.github.com" + gitquery, headers=authhead + "https://api.github.com" + gitquery, headers={**authhead, **headers} ) _vPrint( verbose, From 8e0a0fff12cc11084b18520a8272099edca35d23 Mon Sep 17 00:00:00 2001 From: Laura Weber Date: Mon, 13 Jul 2020 16:43:19 -0700 Subject: [PATCH 2/2] Fix mutable defaults. Fix ambiguous variable. --- scraper/code_gov/models.py | 4 ++-- scraper/github/queryManager.py | 39 ++++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/scraper/code_gov/models.py b/scraper/code_gov/models.py index 8fbcfde..12278db 100644 --- a/scraper/code_gov/models.py +++ b/scraper/code_gov/models.py @@ -247,7 +247,7 @@ def from_github3(klass, repository, labor_hours=True): project["downloadURL"] = repository.downloads_url - project["languages"] = [l for l, _ in repository.languages()] + project["languages"] = [lang for lang, _ in repository.languages()] # project['partners'] = [] @@ -337,7 +337,7 @@ def from_gitlab(klass, repository, labor_hours=True, fetch_languages=False): archive_suffix = "/projects/%s/repository/archive" % repository.get_id() project["downloadURL"] = api_url + archive_suffix - # project['languages'] = [l for l, _ in repository.languages()] + # project['languages'] = [lang for lang, _ in repository.languages()] if fetch_languages: project["languages"] = [*repository.languages()] diff --git a/scraper/github/queryManager.py b/scraper/github/queryManager.py index f06799f..2f61f8d 100644 --- a/scraper/github/queryManager.py +++ b/scraper/github/queryManager.py @@ -143,7 +143,7 @@ def _readGQL(self, filePath, verbose=False): self.__query = query_in return query_in - def queryGitHubFromFile(self, filePath, gitvars={}, verbosity=0, **kwargs): + def queryGitHubFromFile(self, filePath, gitvars=None, verbosity=0, **kwargs): """Submit a GitHub GraphQL query from a file. Can only be used with GraphQL queries. @@ -156,7 +156,7 @@ def queryGitHubFromFile(self, filePath, gitvars={}, verbosity=0, **kwargs): .. _GitHub GraphQL Explorer: https://developer.github.com/v4/explorer/ gitvars (Optional[Dict]): All query variables. - Defaults to empty. + Defaults to None. GraphQL Only. verbosity (Optional[int]): Changes output verbosity levels. If < 0, all extra printouts are suppressed. @@ -169,6 +169,9 @@ def queryGitHubFromFile(self, filePath, gitvars={}, verbosity=0, **kwargs): Dict: A JSON style dictionary. """ + if not gitvars: + gitvars = {} + gitquery = self._readGQL(filePath, verbose=(verbosity >= 0)) return self.queryGitHub( gitquery, gitvars=gitvars, verbosity=verbosity, **kwargs @@ -177,15 +180,15 @@ def queryGitHubFromFile(self, filePath, gitvars={}, verbosity=0, **kwargs): def queryGitHub( self, gitquery, - gitvars={}, + gitvars=None, verbosity=0, paginate=False, cursorVar=None, - keysToList=[], + keysToList=None, rest=False, requestCount=0, pageNum=0, - headers={}, + headers=None, ): """Submit a GitHub query. @@ -195,7 +198,7 @@ def queryGitHub( query: 'query { viewer { login } }' endpoint: '/user' gitvars (Optional[Dict]): All query variables. - Defaults to empty. + Defaults to None. GraphQL Only. verbosity (Optional[int]): Changes output verbosity levels. If < 0, all extra printouts are suppressed. @@ -209,7 +212,7 @@ def queryGitHub( GraphQL Only. keysToList (Optional[List[str]]): Ordered list of keys needed to retrieve the list in the query results to be extended by - pagination. Defaults to empty. + pagination. Defaults to None. Example: ['data', 'viewer', 'repositories', 'nodes'] GraphQL Only. @@ -219,12 +222,19 @@ def queryGitHub( pageNum (Optional[int]): Counter for pagination. For user readable log messages only, does not affect data. headers (Optional[Dict]): Additional headers. - Defaults to empty. + Defaults to None. Returns: Dict: A JSON style dictionary. """ + if not gitvars: + gitvars = {} + if not keysToList: + keysToList = [] + if not headers: + headers = {} + requestCount += 1 pageNum = 0 if pageNum < 0 else pageNum # no negative page numbers pageNum += 1 @@ -437,7 +447,9 @@ def queryGitHub( return outObj - def _submitQuery(self, gitquery, gitvars={}, verbose=False, rest=False, headers={}): + def _submitQuery( + self, gitquery, gitvars=None, verbose=False, rest=False, headers=None + ): """Send a curl request to GitHub. Args: @@ -446,13 +458,13 @@ def _submitQuery(self, gitquery, gitvars={}, verbose=False, rest=False, headers= query: 'query { viewer { login } }' endpoint: '/user' gitvars (Optional[Dict]): All query variables. - Defaults to empty. + Defaults to None. verbose (Optional[bool]): If False, stderr prints will be suppressed. Defaults to False. rest (Optional[bool]): If True, uses the REST API instead of GraphQL. Defaults to False. headers (Optional[Dict]): Additional headers. - Defaults to empty. + Defaults to None. Returns: { @@ -463,6 +475,11 @@ def _submitQuery(self, gitquery, gitvars={}, verbose=False, rest=False, headers= } """ + if not gitvars: + gitvars = {} + if not headers: + headers = {} + authhead = {"Authorization": "bearer " + self.__githubApiToken} if not rest: gitqueryJSON = json.dumps(