Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: trying a license checker #184

Merged
merged 4 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions .github/workflows/license-check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
---
# Runs the dependency license checker on pull requests that touch a
# pyproject.toml, this workflow, or the checker script itself.
name: License Check

"on":
  pull_request:
    paths:
      - '**/pyproject.toml'
      - '.github/workflows/license-check.yml'
      - '.github/workflows/scripts/check_licenses.py'

jobs:
  check-licenses:
    name: Check Package Licenses
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install tomli requests urllib3

      - name: Check licenses
        run: |
          # `|| exit_code=$?` records a failure without aborting the script,
          # so exit_code stays unset when the checker succeeds (hence the
          # default of 0). Any non-zero status fails the step, not just 1:
          # a crash or a missing script must not pass silently.
          python .github/workflows/scripts/check_licenses.py \
            pyproject.toml || exit_code=$?
          if [ "${exit_code:-0}" -ne 0 ]; then
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we use the condition "the exit code is not 0", just in case the script returns an error with an exit code other than 1? Also, set the default value to 1 if exit_code does not exist.

eg. if [ "${exit_code:-1}" -ne 0 ]

# Failure branch of the check above: emit an `::error::` workflow command
# with the message, then fail the step with a non-zero exit status.
echo "::error::Found packages with disallowed licenses"
exit 1
fi

- name: Check Exchange licenses
  run: |
    # exit_code is only assigned when the checker fails, so it defaults to
    # 0 on success. Treat every non-zero status as a failure (not only 1)
    # so that crashes and usage errors do not let the step pass.
    python .github/workflows/scripts/check_licenses.py \
      packages/exchange/pyproject.toml || exit_code=$?
    if [ "${exit_code:-0}" -ne 0 ]; then
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @michaelneale, sorry, I did not notice that we also check the exchange licences here. Maybe change it here too.

Also, one small thing about ${exit_code:-0}: shall we change it to ${exit_code:-1}? That way, if exit_code does not exist, we won't treat it as the licence check passing.

# Failure branch of the exchange check above: emit an `::error::` workflow
# command with the message, then fail the step with a non-zero exit status.
echo "::error::Found packages with disallowed licenses in exchange"
exit 1
fi
174 changes: 174 additions & 0 deletions .github/workflows/scripts/check_licenses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#!/usr/bin/env python3

import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Set

import requests
import tomli
import urllib3

# License names a dependency may declare. The allow-list lives in the script
# itself; entries are compared after normalization, so both SPDX-style
# identifiers and PyPI classifier names are listed.
ALLOWED_LICENSES = {
    # Apache
    "Apache Software License",
    "Apache-2.0",
    # BSD
    "BSD License",
    "BSD-3-Clause",
    # Other permissive licenses
    "ISC",
    "MIT",
    "Python Software Foundation License",
}

# Packages that are approved without a license lookup. Only membership is
# consulted by the checker; every value is True.
EXCEPTIONS = dict.fromkeys(
    (
        "ai-exchange",  # Local workspace package
        "tiktoken",  # Known MIT license with non-standard format
    ),
    True,
)

class LicenseChecker:
    """Check the licenses of a project's dependencies against an allow-list.

    Dependencies are read from a pyproject-style TOML file, their license
    metadata is fetched from the PyPI JSON API, and each license is compared
    (after normalization) with ``ALLOWED_LICENSES``. Packages listed in
    ``EXCEPTIONS`` are approved without a lookup.
    """

    # Seconds to wait for PyPI before giving up; without a timeout a stalled
    # connection would block the CI job indefinitely.
    REQUEST_TIMEOUT = 10

    # A `license` metadata value longer than this (or spanning several lines)
    # is treated as full license text rather than a license name.
    _MAX_LICENSE_NAME_LENGTH = 100

    # Distribution name at the start of a requirement string (PEP 508):
    # letters/digits, optionally with '.', '_' or '-' in between.
    _NAME_PATTERN = re.compile(r'\s*([A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?)')

    # Canonical spellings, keyed by the upper-cased name *after* the trailing
    # " LICENSE"/" LICENCE" word has been removed.
    _LICENSE_ALIASES = {
        'APACHE 2.0': 'APACHE-2.0',
        'APACHE SOFTWARE': 'APACHE-2.0',
        'BSD': 'BSD-3-CLAUSE',
        'PYTHON SOFTWARE FOUNDATION': 'PSF',
    }

    def __init__(self):
        """Create an HTTPS session that verifies certificates and retries 5xx errors."""
        self.session = requests.Session()
        # Configure session for robust SSL handling
        self.session.verify = True
        retry_policy = urllib3.util.Retry(
            total=3,
            backoff_factor=0.5,
            status_forcelist=[500, 502, 503, 504],
        )
        self.session.mount(
            'https://',
            requests.adapters.HTTPAdapter(max_retries=retry_policy),
        )

    def normalize_license(self, license_str: Optional[str]) -> Optional[str]:
        """Normalize a license string for comparison.

        The string is upper-cased, the words " LICENSE"/" LICENCE" are
        removed, surrounding whitespace is stripped, and known aliases are
        mapped to one canonical spelling.

        Returns:
            The normalized name, or None when the input is None, empty, or
            reduces to nothing.
        """
        if not license_str:
            return None

        normalized = (
            license_str.upper()
            .replace(' LICENSE', '')
            .replace(' LICENCE', '')
            .strip()
        )
        if not normalized:
            return None

        return self._LICENSE_ALIASES.get(normalized, normalized)

    def get_package_license(self, package_name: str) -> Optional[str]:
        """Fetch license information for a package from PyPI.

        Returns:
            "APPROVED-EXCEPTION" for packages in ``EXCEPTIONS``; otherwise the
            license reported by PyPI, or None when it cannot be determined.
            Network and parsing errors are reported on stderr and yield None.
        """
        if package_name in EXCEPTIONS:
            return "APPROVED-EXCEPTION"

        try:
            response = self.session.get(
                f"https://pypi.org/pypi/{package_name}/json",
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return self._select_license(response.json()['info'])
        except requests.exceptions.SSLError as e:
            print(f"SSL Error fetching license for {package_name}: {e}", file=sys.stderr)
            return None
        except Exception as e:
            # Best-effort lookup: any failure is reported and treated as
            # "license unknown" instead of aborting the whole run.
            print(f"Warning: Could not fetch license for {package_name}: {e}", file=sys.stderr)
            return None

    @classmethod
    def _select_license(cls, info: dict) -> Optional[str]:
        """Pick the most useful license value from a PyPI ``info`` mapping."""
        from_classifiers = None
        for classifier in info.get('classifiers') or []:
            if classifier.startswith('License :: '):
                from_classifiers = classifier.split(' :: ')[-1]
                break

        declared = info.get('license')
        if not isinstance(declared, str) or not declared.strip():
            return from_classifiers

        # Some packages put the entire license text in the `license` field;
        # it can never match the allow-list, so prefer the classifier then.
        looks_like_full_text = (
            '\n' in declared.strip()
            or len(declared) > cls._MAX_LICENSE_NAME_LENGTH
        )
        if looks_like_full_text and from_classifiers:
            return from_classifiers
        return declared

    def extract_dependencies(self, toml_file: Path) -> List[str]:
        """Extract all dependency names from a TOML file.

        Reads ``project.dependencies`` and ``tool.uv.dev-dependencies``.

        Returns:
            The unique package names, sorted so the result is deterministic.
        """
        with open(toml_file, 'rb') as f:
            data = tomli.load(f)

        # Get direct dependencies
        project_deps = data.get('project', {}).get('dependencies', [])
        # Get dev dependencies
        tool_deps = data.get('tool', {}).get('uv', {}).get('dev-dependencies', [])

        names = set(self._parse_dependency_strings(project_deps))
        names.update(self._parse_dependency_strings(tool_deps))
        return sorted(names)

    def _parse_dependency_strings(self, deps: List[str]) -> List[str]:
        """Parse requirement strings and return the package names, in order.

        Only the leading distribution name is kept, so version specifiers
        (``>=``, ``~=``, ``!=``, ...), extras (``pkg[extra]``), environment
        markers (``; python_version < "3.11"``) and direct references
        (``pkg @ url``) are all dropped. Workspace references and entries
        without a recognizable name are skipped.
        """
        packages = []
        for dep in deps:
            if not isinstance(dep, str):
                continue
            # Skip workspace references
            if dep.endswith('workspace = true}'):
                continue

            match = self._NAME_PATTERN.match(dep)
            if match:
                packages.append(match.group(1))
        return packages

    def check_licenses(self, toml_file: Path) -> Dict[str, Dict[str, object]]:
        """Check licenses for all dependencies in the TOML file.

        Returns:
            A mapping of package name to ``{'license': ..., 'allowed': ...}``.
            ``license`` is the raw value from PyPI (None when unknown) and
            ``allowed`` is True only when the normalized license is on the
            allow-list or the package is an approved exception.
        """
        # Normalize the allow-list once instead of once per package.
        allowed_normalized = {
            self.normalize_license(name) for name in ALLOWED_LICENSES
        }

        results = {}
        for package in self.extract_dependencies(toml_file):
            if package in EXCEPTIONS:
                results[package] = {
                    'license': 'Approved Exception',
                    'allowed': True
                }
                continue

            license_info = self.get_package_license(package)
            normalized_license = self.normalize_license(license_info)

            results[package] = {
                'license': license_info,
                # An unknown license (None) is never allowed.
                'allowed': bool(normalized_license)
                and normalized_license in allowed_normalized,
            }

        return results

def main():
    """Command-line entry point: check the licenses listed in a TOML file.

    Expects the TOML path as the first argument. Prints one status line per
    package in name order and exits with status 1 when the argument is
    missing or any package is not allowed, otherwise with status 0.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: check_licenses.py <toml_file>", file=sys.stderr)
        sys.exit(1)

    pyproject_path = Path(cli_args[0])
    report = LicenseChecker().check_licenses(pyproject_path)

    rejected = 0
    for name in sorted(report):
        entry = report[name]
        if entry['allowed']:
            marker = "✓"
        else:
            marker = "✗"
            rejected += 1
        print(f"{marker} {name}: {entry['license']}")

    sys.exit(1 if rejected else 0)


if __name__ == '__main__':
    main()
12 changes: 12 additions & 0 deletions .github/workflows/test-events/pull_request.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"pull_request": {
"head": {
"ref": "test-branch"
},
"base": {
"ref": "main"
},
"number": 123,
"title": "test: Update dependency licenses"
}
}