From e94be0c3c98e71940552b6c43e0adacc75f1ea63 Mon Sep 17 00:00:00 2001 From: "Yuan (Bob) Gong" Date: Wed, 18 Dec 2019 10:49:58 +0800 Subject: [PATCH] Moving go license tools from KFP repo (#540) * Moving go license tools from KFP repo * Move go license tools to py folder * Add __init__.py file * Update setup.py * Fix pylint format * Fix lint errors --- .../testing/go-license-tools/README.md | 46 ++++ .../testing/go-license-tools/__init__.py | 0 .../go-license-tools/concatenate_license.py | 82 +++++++ .../get_github_license_info.py | 107 +++++++++ .../go-license-tools/get_github_repo.py | 211 ++++++++++++++++++ .../go-license-tools/parse_toml_dep.py | 51 +++++ py/kubeflow/testing/go-license-tools/setup.py | 42 ++++ 7 files changed, 539 insertions(+) create mode 100644 py/kubeflow/testing/go-license-tools/README.md create mode 100644 py/kubeflow/testing/go-license-tools/__init__.py create mode 100644 py/kubeflow/testing/go-license-tools/concatenate_license.py create mode 100644 py/kubeflow/testing/go-license-tools/get_github_license_info.py create mode 100644 py/kubeflow/testing/go-license-tools/get_github_repo.py create mode 100644 py/kubeflow/testing/go-license-tools/parse_toml_dep.py create mode 100644 py/kubeflow/testing/go-license-tools/setup.py diff --git a/py/kubeflow/testing/go-license-tools/README.md b/py/kubeflow/testing/go-license-tools/README.md new file mode 100644 index 00000000000..8730f04f1b9 --- /dev/null +++ b/py/kubeflow/testing/go-license-tools/README.md @@ -0,0 +1,46 @@ +# CLI tools to fetch go library's license info + +## Why we need this? + +When we release go library images (can be considered as redistributing third +party binary). + +We need to do the following to be compliant: +* Put license declarations in the image for licenses of all dependencies and transitive dependencies. +* Mirror source code in the image for code with MPL, EPL, GPL or CDDL licenses. 
+ +It's not an easy task to get license of all (transitive) dependencies of a go +library. Thus, we need these tools to automate this task. + +## How to get all dependencies with license and source code? + +1. Install CLI tools here: `python setup.py install` +1. Collect dependencies + transitive dependencies in a go library. Put them together in a text file called `dep.txt`. Format: each line has a library name. The library name should be a valid golang import module name. + + Example ways to get it: + * argo uses gopkg for package management. It has a [Gopkg.lock file](https://github.com/argoproj/argo/blob/master/Gopkg.lock) + with all of its dependencies and transitive dependencies. All the name fields in this file are what we need. You can run `parse-toml-dep` to parse it. + * minio uses [official go modules](https://blog.golang.org/using-go-modules), there's a [go.mod file](https://github.com/minio/minio/blob/master/go.mod) describing its direct dependencies. Run command `go list -m all` to get final versions that will be used in a build for all direct and indirect dependencies, [reference](https://github.com/golang/go/wiki/Modules#daily-workflow). Parse its output to make a file we need. + + Reminder: don't forget to put the library itself into `dep.txt`. +1. Run `get-github-repo` to resolve github repos of golang imports. Not all +imports can be figured out by the script; manual help is needed for <2% of libraries. + + For a library we cannot resolve, manually put it in `dep_repo.manual.csv`, so the tool knows how to find its github repo the next time. + + Defaults to read dependencies from `dep.txt` and writes to `repo.txt`. +1. Run `get-github-license-info` to crawl github license info of these libraries. (Not all repos have github recognizable license, needs manual help for <2% of libraries) + + Defaults to read repos from `repo.txt` and writes to `license_info.csv`. 
You + need to configure a github personal access token because it sends a lot of + requests to github. Follow instructions in `get-github-license-info -h`. + + For repos that fail to fetch a license, it's usually because their github repo + doesn't have a github understandable license file. Check its readme and + update correct info into `license_info.csv`. (Usually, use its README file which mentions license.) +1. Edit license info file. Manually check the license file for all repos with a license categorized as "Other" by github. Figure out their true license names. +1. Run `concatenate-license` to crawl full text license files for all dependencies and concat them into one file. + + Defaults to read license info from `license_info.csv`. Writes to `license.txt`. + Put `license.txt` to `third_party/library/license.txt` where it is read when building docker images. +1. Manually update a list of dependencies that require source code, put it into `third_party/library/repo-MPL.txt`. diff --git a/py/kubeflow/testing/go-license-tools/__init__.py b/py/kubeflow/testing/go-license-tools/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/py/kubeflow/testing/go-license-tools/concatenate_license.py b/py/kubeflow/testing/go-license-tools/concatenate_license.py new file mode 100644 index 00000000000..0c2ef54a798 --- /dev/null +++ b/py/kubeflow/testing/go-license-tools/concatenate_license.py @@ -0,0 +1,82 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import requests +import sys +import traceback + +parser = argparse.ArgumentParser( + description='Generate dependencies json from license.csv file.') +parser.add_argument( + 'license_info_file', + nargs='?', + default='license_info.csv', + help='CSV file with license info fetched from github using get-github-license-info CLI tool.' + +'(default: %(default)s)', +) +parser.add_argument( + '-o', + '--output', + dest='output_file', + nargs='?', + default='license.txt', + help= + 'Concatenated license file path this command generates. (default: %(default)s)' +) +args = parser.parse_args() + + +def fetch_license_text(download_link): + response = requests.get(download_link) + assert response.ok, 'Fetching {} failed with {} {}'.format( + download_link, response.status_code, response.reason) + return response.text + + +def main(): + with open(args.license_info_file, + 'r') as license_info_file, open(args.output_file, + 'w') as output_file: + repo_failed = [] + for line in license_info_file: + line = line.strip() + [repo, license_link, license_name, + license_download_link] = line.split(',') + try: + print('Repo {} has license download link {}'.format( + repo, license_download_link), + file=sys.stderr) + license_text = fetch_license_text(license_download_link) + print( + '--------------------------------------------------------------------------------', + file=output_file, + ) + print('{} {} {}'.format(repo, license_name, license_link), + file=output_file) + print( + '--------------------------------------------------------------------------------', + file=output_file, + ) + print(license_text, file=output_file) + except Exception as e: # pylint: disable=broad-except + print('[failed]', e, file=sys.stderr) + traceback.print_exc(file=sys.stderr) + repo_failed.append(repo) + print('Failed to download license file for {} repos.'.format(len(repo_failed)), 
file=sys.stderr) + for repo in repo_failed: + print(repo, file=sys.stderr) + + +main() diff --git a/py/kubeflow/testing/go-license-tools/get_github_license_info.py b/py/kubeflow/testing/go-license-tools/get_github_license_info.py new file mode 100644 index 00000000000..2c960f0014d --- /dev/null +++ b/py/kubeflow/testing/go-license-tools/get_github_license_info.py @@ -0,0 +1,107 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import requests +import sys +import traceback +from pathlib import Path + +home = str(Path.home()) +parser = argparse.ArgumentParser( + description='Get github license info from github APIs.') +parser.add_argument( + 'repo_list', + nargs='?', + default='repo.txt', + help= + 'Github repo list file with one line per github repo. Format: org/repo. (default: %(default)s)', +) +parser.add_argument( + '-o', + '--output', + dest='output_file', + nargs='?', + default='license_info.csv', + help= + 'Output file with one line per github repo. Line format: ' + +'org/repo,license_html_url,license_name,license_download_url. (default: %(default)s)', +) +parser.add_argument( + '--github-api-token-file', + dest='github_api_token_file', + default='{}/.github_api_token'.format(home), + help='You need to create a github personal access token at https://github.com/settings/tokens, ' + +'because github has very strict limit on anonymous API usage. (default: %(default)s) Format: a ' + +'text file with one line. 
' + +'"<40 characters string shown when a new personal access token is created>"' +) +args = parser.parse_args() + + +def main(): + token = None + try: + with open(args.github_api_token_file, 'r') as token_file: + token = token_file.read().strip() + print('Read github API token from {}, length {}.'.format( + args.github_api_token_file, len(token)), + file=sys.stderr) + except FileNotFoundError: + raise Exception(( + 'Please put a github api token file at {}, or specify a different token file path by ' + +'--github-api-token-file. Github API token is needed because anonymous API access limit ' + +'is not enough.' + ).format(args.github_api_token_file)) + + # github personal access token is always 40 characters long + assert len(token) == 40 + # reference: https://developer.github.com/v3/#oauth2-token-sent-in-a-header + headers = {'Authorization': 'token {}'.format(token)} + with open(args.repo_list, + 'r') as repo_list_file, open(args.output_file, + 'w') as output_file: + repo_succeeded = [] + repo_failed = [] + for repo in repo_list_file: + repo = repo.strip() + print('Fetching license for {}'.format(repo), file=sys.stderr) + try: + url = 'https://api.github.com/repos/{}/license'.format(repo) + response = requests.get(url, headers=headers) + if not response.ok: + print('Error response content:\n{}'.format(response.content), file=sys.stderr) + raise Exception('fetching {} failed with {} {}'.format( + url, response.status_code, response.reason)) + data = response.json() + + download_url = data['download_url'] + license_name = data['license']['name'] + html_url = data['html_url'] + print('{},{},{},{}'.format(repo, html_url, license_name, download_url), file=output_file) + repo_succeeded.append(repo) + except Exception as e: # pylint: disable=broad-except + print('[failed]', e, file=sys.stderr) + traceback.print_exc(file=sys.stderr) + repo_failed.append(repo) + print('Fetched github license info, {} succeeded, {} failed.'.format( + len(repo_succeeded), 
len(repo_failed)), file=sys.stderr) + if repo_failed: + print('The following repos failed:', file=sys.stderr) + for repo in repo_failed: + print(repo, file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/py/kubeflow/testing/go-license-tools/get_github_repo.py b/py/kubeflow/testing/go-license-tools/get_github_repo.py new file mode 100644 index 00000000000..f70c9599067 --- /dev/null +++ b/py/kubeflow/testing/go-license-tools/get_github_repo.py @@ -0,0 +1,211 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import requests +import sys +import traceback +from bs4 import BeautifulSoup as Soup + +parser = argparse.ArgumentParser( + description='Get github repo from go import path.') +parser.add_argument( + 'go_dependency_list_file', + nargs='?', + default='dep.txt', + help= + 'File path of a golang dependency list file, one line has a dependency name. ' + +'(default: %(default)s)', +) +parser.add_argument( + '-o', + '--output', + dest='output_file', + nargs='?', + default='repo.txt', + help= + 'Output file with one line per resolved github repo. Format: org/repo. (default: %(default)s)', +) +parser.add_argument( + '--manual-dep-repo-mapping', + dest='manual_dep_repo_mapping_file', + nargs='?', + default='dep_repo.manual.csv', + help= + 'Optional dependency to repo mapping maintained manually for dependencies we cannot ' + +'automatically resolve. 
Format: each line has dependency import name and its github repo ' + +'separated by comma. Like, "upper.io/db.v3,upper/db". Note: github/upper/db is the repo. ' + +'(default: %(default)s)' +) +args = parser.parse_args() + +protocol = 'https://' +godoc_base = 'godoc.org/' +github_base = 'github.com/' +gopkg_base = 'gopkg.in/' + + +def github_link_to_repo(repo): + ''' + Removes extra sub folder in github url. + ''' + if len(repo.split('/')) > 2: + print('repo {} has subfolder'.format(repo), file=sys.stderr) + repo = '/'.join(repo.split('/')[:2]) + assert len(repo.split( + '/')) == 2, 'repo name should be org/repo, but is {}'.format(repo) + return repo + + +def get_github_repo(url): + ''' + Tries to resolve github repo from a github url. + Returns org/repo format github repo string. + ''' + if url.startswith(protocol): + url = url[len(protocol):] + if not url.startswith(github_base): + raise Exception('Package url is not github: {}'.format(url)) + github_repo = url[len(github_base):] + github_repo = github_link_to_repo(github_repo) + if github_repo[-1] == '/': + github_repo = github_repo[:-1] + return github_repo + + +def fetch_github_uri_from_godoc(url): + ''' + Tries to resolve github repo from godoc website. + + Implementation: Godoc is a standard way for a lot of golang libraries to + host its documentation. Godoc page usually has a link on top left with + github repo url. This function crawls godoc page for the library and finds + the github url there. If the link there isn't a github url, it throws an + exception. 
+ ''' + full_url = protocol + godoc_base + url + print('fetching godoc {}'.format(full_url), file=sys.stderr) + response = requests.get(full_url) + assert response.ok, 'it failed with {} {}'.format(response.status_code, + response.reason) + + soup = Soup(response.text, features="html.parser") + navs = soup.select('#x-projnav') + if len(navs) != 1: + raise Exception( + '#x-projnav should occur exactly once, but {} found for {}'.format(len(navs), url)) + nav = navs[0] + package_name = nav.select_one('span').contents[0] + assert package_name == url, 'fetched package name should be the same' + link = nav.select_one('a').attrs.get('href') + return get_github_repo(link) + + +def fetch_gopkg_uri(url): + ''' + Tries to resolve github repo for gopkg libraries. + + Implementation: gopkg library page has a button with text 'Source code', its + url is usually the corresponding github repo. Throws an exception if the url + found is not github. + ''' + response = requests.get(protocol + url) + assert response.ok, 'fetching {} failed with {} {}'.format( + url, response.status_code, response.reason) + + soup = Soup(response.text, features="html.parser") + + def is_source_code_link(link): + return link.getText().find('Source Code') >= 0 + + source_code_links = list(filter(is_source_code_link, soup.select('a'))) + assert len( + source_code_links) == 1, 'Expect exactly one source code link found' + + link = source_code_links[0].attrs.get('href') + return get_github_repo(link) + + +def get_github_repo_for_dep(dep): + ''' + Tries to resolve github repo by three ways: + 1. fetch gopkg website + 2. parse from github url + 3. 
fetch godoc website + ''' + print('Fetching github uri for {}'.format(dep), file=sys.stderr) + repo = None + if dep.startswith(gopkg_base): + print('Try fetching {} from gopkg'.format(dep), file=sys.stderr) + repo = fetch_gopkg_uri(dep) + elif dep.startswith(github_base): + print('{} is already github'.format(dep), file=sys.stderr) + repo = get_github_repo(dep) + else: + print('Try fetching {} repo from godoc'.format(dep), file=sys.stderr) + repo = fetch_github_uri_from_godoc(dep) + return repo + + +def main(): + with open(args.go_dependency_list_file, + 'r') as dep_file, open(args.output_file, 'w') as output_file: + mappings = {} + try: + with open(args.manual_dep_repo_mapping_file, 'r') as dep_repo_mapping_file: + for line in dep_repo_mapping_file: + mapping = line.strip().split(',') + assert len(mapping) == 2 + [dep, repo] = mapping + mappings[dep] = repo + except Exception: # pylint: disable=broad-except + print('ignore manual_dep_repo_mapping_file', file=sys.stderr) + deps = [line.strip() for line in dep_file] + repo_seen = set() + dep_succeeded = [] + # Dependencies that we couldn't resolve their github repos. + dep_failed = [] + for dep in deps: + try: + # Get dep's repo from manually maintained mapping first. + repo = mappings.get(dep) + if repo is not None: + print('repo of {} is already configured to {}'.format(dep, repo), file=sys.stderr) + else: + # Try to resolve if not found + repo = get_github_repo_for_dep(dep) + if repo in repo_seen: + print('repo {} is seen more than once'.format(repo), file=sys.stderr) + else: + repo_seen.add(repo) + print(repo, file=output_file) + dep_succeeded.append(dep) + except Exception as e: # pylint: disable=broad-except + print('[failed]', e, file=sys.stderr) + traceback.print_exc(file=sys.stderr) + dep_failed.append(dep) + print() + print(( + 'Successfully resolved github repo for {} dependencies and saved to {}. ' + +'Failed to resolve {} dependencies.' 
+ ).format(len(dep_succeeded), args.output_file, len(dep_failed)), + file=sys.stderr) + if dep_failed: + print('We failed to resolve the following dependencies:', file=sys.stderr) + for dep in dep_failed: + print(dep, file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/py/kubeflow/testing/go-license-tools/parse_toml_dep.py b/py/kubeflow/testing/go-license-tools/parse_toml_dep.py new file mode 100644 index 00000000000..10408af4820 --- /dev/null +++ b/py/kubeflow/testing/go-license-tools/parse_toml_dep.py @@ -0,0 +1,51 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import sys +import toml + +parser = argparse.ArgumentParser( + description='Parse toml format go dependencies maintained by dep tool.') +parser.add_argument('dep_lock_path', + nargs='?', + default='Gopkg.lock', + help='Toml format go dependency lock file.') +parser.add_argument( + '-o', + '--output', + dest='output_file', + nargs='?', + default='dep.txt', + help='Output file with one line per golang library. 
(default: %(default)s)', +) + +args = parser.parse_args() + + +def main(): + print('Parsing dependencies from {}'.format(args.dep_lock_path), file=sys.stderr) + + with open(args.output_file, 'w') as output_file: + deps = toml.load(args.dep_lock_path) + projects = deps.get('projects') + dep_names = list(map(lambda p: p.get('name'), projects)) + for name in dep_names: + print(name, file=output_file) + + print('Found {} dependencies'.format(len(projects)), file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/py/kubeflow/testing/go-license-tools/setup.py b/py/kubeflow/testing/go-license-tools/setup.py new file mode 100644 index 00000000000..7dbc7c3c074 --- /dev/null +++ b/py/kubeflow/testing/go-license-tools/setup.py @@ -0,0 +1,42 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from setuptools import setup + +NAME = 'go-license-tools' +VERSION = '0.0.1' + +REQUIRES = [ + 'bs4', + 'requests', + 'toml', +] + +setup(name=NAME, + version=VERSION, + description='Go license tools', + author='google', + install_requires=REQUIRES, + packages=[ + '.', + ], + python_requires='>=3.5.3', + entry_points={ + 'console_scripts': [ + 'get-github-repo = get_github_repo:main', + 'get-github-license-info = get_github_license_info:main', + 'concatenate-license = concatenate_license:main', + 'parse-toml-dep = parse_toml_dep:main', + ] + })