Skip to content

Commit

Permalink
Merge pull request #2592 from akugarg/improve_license_detection
Browse files Browse the repository at this point in the history
Detect unknown licenses #1675

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
  • Loading branch information
pombredanne authored Jan 8, 2022
2 parents 46a3594 + 7a88832 commit d1e725d
Show file tree
Hide file tree
Showing 289 changed files with 2,716 additions and 461 deletions.
50 changes: 36 additions & 14 deletions etc/scripts/licenses/buildrules.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,22 @@ def rule_exists(text):
return match.rule.identifier


def all_rule_by_tokens():
    """
    Return a mapping of {tuples of tokens: rule id}, with one item for each
    existing and added rules. Used to avoid duplicates.

    Raise an Exception naming the rule identifier and its data and text files
    when the tokens of a rule cannot be obtained. The original error is
    chained as the cause.
    """
    rule_tokens = {}
    for rule in models.get_rules():
        try:
            rule_tokens[tuple(rule.tokens())] = rule.identifier
        except Exception as e:
            # Use f-strings rather than string concatenation: if a file
            # attribute is not a string (e.g. unset), concatenation would
            # raise a TypeError here and hide the original error.
            df = f' file://{rule.data_file}'
            tf = f' file://{rule.text_file}'
            raise Exception(
                f'Failed to get tokens from rule: {rule.identifier}\n'
                f'{df}\n{tf}'
            ) from e
    return rule_tokens


Expand Down Expand Up @@ -185,7 +193,7 @@ def cli(licenses_file):
"""

rules_data = load_data(licenses_file)
rules_tokens = all_rule_tokens()
rule_by_tokens = all_rule_by_tokens()

licenses_by_key = cache.get_licenses_db()
skinny_rules = []
Expand All @@ -205,10 +213,6 @@ def cli(licenses_file):

print()
for rule in skinny_rules:
existing = rule_exists(rule.text())
if existing:
print('Skipping existing rule:', existing, 'with text:\n', rule.text()[:50].strip(), '...')
continue

if rule.is_false_positive:
base_name = 'false-positive'
Expand All @@ -217,6 +221,21 @@ def cli(licenses_file):
else:
base_name = rule.license_expression

text = rule.text()

existing_rule = rule_exists(text)
skinny_text = ' '.join(text[:80].split())

existing_msg = (
f'Skipping rule for: {base_name!r}, '
'dupe of: {existing_rule} '
f'with text: {skinny_text!r}...'
)

if existing_rule:
print(existing_msg.format(**locals()))
continue

base_loc = find_rule_base_loc(base_name)

rd = rule.to_dict()
Expand All @@ -234,17 +253,20 @@ def cli(licenses_file):

rule_tokens = tuple(rulerec.tokens())

if rule_tokens in rules_tokens:
print('Skipping already added rule with text for:', base_name)
existing_rule = rule_by_tokens.get(rule_tokens)
if existing_rule:
print(existing_msg.format(**locals()))
continue
else:
print('Adding new rule:')
print(f'Adding new rule: {base_name}')
print(' file://' + rulerec.data_file)
print(' file://' + rulerec.text_file,)
rules_tokens.add(rule_tokens)
rulerec.dump()
models.update_ignorables(rulerec, verbose=False)
rulerec.dump()

rule_by_tokens[rule_tokens] = base_name


if __name__ == '__main__':
cli()
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/agpl-3.0-plus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ notes: |
spdx_license_key: AGPL-3.0-or-later
other_spdx_license_keys:
- AGPL-3.0+
- LicenseRef-AGPL
text_urls:
- http://www.gnu.org/licenses/agpl.txt
osi_url: http://www.opensource.org/licenses/agpl-v3.html
Expand Down
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/agpl-3.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ notes: |
spdx_license_key: AGPL-3.0-only
other_spdx_license_keys:
- AGPL-3.0
- LicenseRef-AGPL-3.0
text_urls:
- http://www.fsf.org/licensing/licenses/agpl-3.0.html
osi_url: http://www.opensource.org/licenses/agpl-v3.html
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ owner: Unspecified
notes: this is a generic entry for rare one-off AGPL extra license terms. These are typically
additional terms under section 7 of the AGPL-3.0
is_exception: yes
is_generic: yes
spdx_license_key: LicenseRef-scancode-agpl-generic-additional-terms
3 changes: 3 additions & 0 deletions src/licensedcode/data/licenses/apache-2.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ notes: |
Per SPDX.org, this version was released January 2004 This license is OSI
certified
spdx_license_key: Apache-2.0
other_spdx_license_keys:
- LicenseRef-Apache
- LicenseRef-Apache-2.0
osi_license_key: Apache-2.0
text_urls:
- http://www.apache.org/licenses/LICENSE-2.0
Expand Down
2 changes: 2 additions & 0 deletions src/licensedcode/data/licenses/commercial-license.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ short_name: Commercial License
name: Commercial License
category: Commercial
owner: Unspecified
is_generic: yes
notes: this is a generic commercial license
spdx_license_key: LicenseRef-scancode-commercial-license
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/commercial-option.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ short_name: Commercial Option
name: Commercial Option
category: Commercial
owner: Unspecified
is_generic: yes
notes: replaced by commercial-license
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/generic-cla.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
key: generic-cla
is_generic: yes
short_name: Generic CLA
name: Prior Generic Contributor License Agreement
category: Unstated License
Expand Down
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/generic-exception.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ owner: Unspecified
notes: this is a generic license exception notice. Actual terms are most commonly related to
rare, one-off extra permission to the A/L/GPL licenses
is_exception: yes
is_generic: yes
spdx_license_key: LicenseRef-scancode-generic-exception
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ category: Unstated License
owner: Unspecified
notes: this is a generic export compliance notice. Actual terms are most commonly related to
cryptography
is_generic: yes
spdx_license_key: LicenseRef-scancode-generic-export-compliance
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/generic-tos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ category: Unstated License
owner: Unspecified
notes: this is a generic license for Terms of Service such as privacy terms and other ToS-like
agreements found in software but that are not directly licenses.
is_generic: yes
spdx_license_key: LicenseRef-scancode-generic-tos
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/generic-trademark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ owner: Unspecified
notes: this is a generic export Trademark and name related notice. Actual terms are most commonly
related to name use restrictions and no endorsement. This should be used only for rare one-off
notices.
is_generic: yes
spdx_license_key: LicenseRef-scancode-generic-trademark
other_spdx_license_keys:
- LicenseRef-scancode-trademark-notice
Expand Down
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/gpl-1.0-plus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ notes: Per SPDX.org, this license was released February 1989.
spdx_license_key: GPL-1.0-or-later
other_spdx_license_keys:
- GPL-1.0+
- LicenseRef-GPL
text_urls:
- http://www.gnu.org/licenses/old-licenses/gpl-1.0-standalone.html
other_urls:
Expand Down
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/gpl-2.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ spdx_license_key: GPL-2.0-only
other_spdx_license_keys:
- GPL-2.0
- GPL 2.0
- LicenseRef-GPL-2.0
text_urls:
- http://www.gnu.org/licenses/gpl-2.0.txt
- http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt
Expand Down
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/gpl-3.0-plus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ notes: |
spdx_license_key: GPL-3.0-or-later
other_spdx_license_keys:
- GPL-3.0+
- LicenseRef-GPL-3.0-or-later
text_urls:
- http://www.gnu.org/licenses/gpl-3.0-standalone.html
other_urls:
Expand Down
1 change: 1 addition & 0 deletions src/licensedcode/data/licenses/gpl-3.0.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ notes: |
spdx_license_key: GPL-3.0-only
other_spdx_license_keys:
- GPL-3.0
- LicenseRef-gpl-3.0
text_urls:
- http://www.gnu.org/licenses/gpl-3.0-standalone.html
- http://www.gnu.org/licenses/gpl-3.0.txt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ owner: Unspecified
notes: this is a generic entry for rare one-off GPL extra license terms. These are typically
additional terms under section 7 of the GPL-3.0
is_exception: yes
is_generic: yes
spdx_license_key: LicenseRef-scancode-gpl-generic-additional-terms
Loading

0 comments on commit d1e725d

Please sign in to comment.