diff --git a/src/licensedcode/match.py b/src/licensedcode/match.py index e71dd7a3eb3..0e51c78d8a0 100644 --- a/src/licensedcode/match.py +++ b/src/licensedcode/match.py @@ -386,7 +386,7 @@ def qcontains_stopwords(self): stopwords_pos = qspan & query.stopwords_span stopwords_pos = (pos for pos in stopwords_pos if pos != qspe) qry_stopxpos = query.stopwords_by_pos - return any(qry_stopxpos[pos] for pos in stopwords_pos) + return any(qry_stopxpos.get(pos, 0) for pos in stopwords_pos) def qrange(self): """ @@ -1217,8 +1217,12 @@ def filter_low_score(matches, min_score=100): return kept, discarded -def filter_spurious_single_token(matches, query=None, unknown_count=5, - trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN): +def filter_spurious_single_token( + matches, + query=None, + unknown_count=5, + trace=TRACE_FILTER_SPURIOUS_SINGLE_TOKEN, +): """ Return a filtered list of kept LicenseMatch matches and a list of discardable matches given a `matches` list of LicenseMatch by removing @@ -1250,9 +1254,10 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5, qend = match.qend # compute the number of unknown tokens before and after this single - # matched position note: unknowns_by_pos is a defaultdict(int), - # shorts_and_digits is a set of integers - before = unknowns_by_pos[qstart - 1] + # matched position note: + # - unknowns_by_pos is a dict, + # - shorts_and_digits is a set of ints + before = unknowns_by_pos.get(qstart - 1, 0) for p in range(qstart - 1 - unknown_count, qstart): if p in shorts_and_digits: before += 1 @@ -1263,7 +1268,7 @@ def filter_spurious_single_token(matches, query=None, unknown_count=5, kept.append(match) continue - after = unknowns_by_pos[qstart] + after = unknowns_by_pos.get(qstart, 0) for p in range(qend, qend + 1 + unknown_count): if p in shorts_and_digits: after += 1 diff --git a/src/licensedcode/query.py b/src/licensedcode/query.py index c1c02e3348a..31264784ce8 100644 --- a/src/licensedcode/query.py +++ 
b/src/licensedcode/query.py @@ -228,7 +228,7 @@ def __init__( # index of "known positions" (yes really!) to a number of unknown tokens # after that known position. For unknowns at the start, the position is # using the magic -1 key - self.unknowns_by_pos = defaultdict(int) + self.unknowns_by_pos = {} # Span of "known positions" (yes really!) followed by unknown token(s) self.unknowns_span = None @@ -236,7 +236,7 @@ def __init__( # index of "known positions" (yes really!) to a number of stopword # tokens after that known position. For stopwords at the start, the # position is using the magic -1 key - self.stopwords_by_pos = defaultdict(int) + self.stopwords_by_pos = {} # Span of "known positions" (yes really!) followed by stopwords self.stopwords_span = None @@ -355,12 +355,12 @@ def tokens_with_unknowns(self): """ unknowns = self.unknowns_by_pos # yield anything at the start - for _ in range(unknowns[-1]): + for _ in range(unknowns.get(-1, 0)): yield None for pos, token in enumerate(self.tokens): yield token - for _ in range(unknowns[pos]): + for _ in range(unknowns.get(pos, 0)): yield None def tokens_by_line( @@ -386,11 +386,13 @@ def tokens_by_line( # bind frequently called functions to local scope line_by_pos_append = self.line_by_pos.append - self_unknowns_by_pos = self.unknowns_by_pos + # we use a defaultdict as a convenience at construction time + unknowns_by_pos = defaultdict(int) unknowns_pos = set() unknowns_pos_add = unknowns_pos.add - self_stopwords_by_pos = self.stopwords_by_pos + # we use a defaultdict as a convenience at construction time + stopwords_by_pos = defaultdict(int) stopwords_pos = set() stopwords_pos_add = stopwords_pos.add @@ -443,11 +445,11 @@ def tokens_by_line( # If we have not yet started globally, then all tokens # seen so far are stopwords and we keep a count of them # in the magic "-1" position. 
- self_stopwords_by_pos[-1] += 1 + stopwords_by_pos[-1] += 1 else: # here we have a new unknwon token positioned right after # the current known_pos - self_stopwords_by_pos[known_pos] += 1 + stopwords_by_pos[known_pos] += 1 stopwords_pos_add(known_pos) # we do not track stopwords, only their position continue @@ -456,11 +458,11 @@ def tokens_by_line( # If we have not yet started globally, then all tokens # seen so far are unknowns and we keep a count of them # in the magic "-1" position. - self_unknowns_by_pos[-1] += 1 + unknowns_by_pos[-1] += 1 else: # here we have a new unknwon token positioned right after # the current known_pos - self_unknowns_by_pos[known_pos] += 1 + unknowns_by_pos[known_pos] += 1 unknowns_pos_add(known_pos) line_tokens_append(tid) @@ -492,11 +494,14 @@ def tokens_by_line( yield line_tokens - # finally create a Span of positions followed by unkwnons and another - # for positions followed by stopwords used for intersection with the - # query span to do the scoring matches correctly + # finally update the attributes and create a Span of positions followed + # by unknowns and another for positions followed by stopwords used for + # intersection with the query span to do the scoring matches correctly self.unknowns_span = Span(unknowns_pos) self.stopwords_span = Span(stopwords_pos) + # also convert the defaultdicts back to plain dicts + self.unknowns_by_pos = dict(unknowns_by_pos) + self.stopwords_by_pos = dict(stopwords_by_pos) def tokenize_and_build_runs(self, tokens_by_line, line_threshold=4): """ @@ -760,14 +765,14 @@ def tokens_with_unknowns(self): unknowns = self.query.unknowns_by_pos # yield anything at the start only if this is the first query run if self.start == 0: - for _ in range(unknowns[-1]): + for _ in range(unknowns.get(-1, 0)): yield None for pos, token in self.tokens_with_pos(): yield token if pos == self.end: break - for _ in range(unknowns[pos]): + for _ in range(unknowns.get(pos, 0)): yield None def tokens_with_pos(self): 
diff --git a/tests/licensedcode/data/query/unknown_positions/lz4.license.txt b/tests/licensedcode/data/query/unknown_positions/lz4.license.txt new file mode 100644 index 00000000000..c221aebd26e --- /dev/null +++ b/tests/licensedcode/data/query/unknown_positions/lz4.license.txt @@ -0,0 +1,11 @@ +This repository uses 2 different licenses : +- all files in the `lib` directory use a BSD 2-Clause license +- all other files use a GPLv2 license, unless explicitly stated otherwise + +Relevant license is reminded at the top of each source file, +and with presence of COPYING or LICENSE file in associated directories. + +This model is selected to emphasize that +files in the `lib` directory are designed to be included into 3rd party applications, +while all other files, in `programs`, `tests` or `examples`, +receive more limited attention and support for such scenario. diff --git a/tests/licensedcode/test_query.py b/tests/licensedcode/test_query.py index bf6d34cfdbf..17b2ca563e9 100644 --- a/tests/licensedcode/test_query.py +++ b/tests/licensedcode/test_query.py @@ -9,6 +9,7 @@ import json import os +from collections import defaultdict from commoncode.testcase import FileBasedTesting from licensedcode import cache @@ -395,6 +396,31 @@ def test_query_run_unknowns(self): expected = {-1: 2, 0: 4, 1: 3} assert dict(q.unknowns_by_pos) == expected + def test_query_unknowns_by_pos_and_stopwords_are_not_defaultdict_and_not_changed_on_query(self): + idx = index.LicenseIndex( + [Rule(stored_text='a is the binary')], + _legalese=set(['binary']), + _spdx_tokens=set() + ) + q = Query(query_string='binary that was a binary', idx=idx) + list(q.tokens_by_line()) + assert q.unknowns_by_pos == {0: 2} + assert q.stopwords_by_pos == {0: 1} + + assert not isinstance(q.unknowns_by_pos, defaultdict) + assert not isinstance(q.stopwords_by_pos, defaultdict) + + try: + q.unknowns_by_pos[1] + assert q.unknowns_by_pos == {0: 2} + except KeyError: + pass + try: + q.stopwords_by_pos[1] + assert 
q.stopwords_by_pos == {0: 1} + except KeyError: + pass + class TestQueryWithMultipleRuns(IndexTesting): @@ -768,3 +794,52 @@ def test_query_run_for_text_with_long_lines(self): idx = cache.get_index() assert len(Query(location1, idx=idx).query_runs) == 17 assert len(Query(location2, idx=idx).query_runs) == 15 + + def test_match_does_not_change_query_unknown_positions(self): + from licensedcode.match import LicenseMatch + from licensedcode.spans import Span + + location = self.get_test_loc('query/unknown_positions/lz4.license.txt') + idx = cache.get_index() + # build a query first + qry1 = Query(location, idx=idx) + # this has the side effect to populate the unknown + txt = u' '.join(f'{i}-{idx.tokens_by_tid[t]}' for i, t in enumerate(qry1.tokens)) + assert txt == ( + '0-this 1-repository 2-uses 3-2 4-different 5-licenses ' + '6-all 7-files 8-in 9-the 10-lib 11-directory 12-use 13-bsd 14-2 15-clause 16-license ' + '17-all 18-other 19-files 20-use 21-gplv2 22-license 23-unless 24-explicitly 25-stated 26-otherwise ' + '27-relevant 28-license 29-is 30-reminded 31-at 32-the 33-top 34-of 35-each 36-source 37-file ' + '38-and 39-with 40-presence 41-of 42-copying 43-or 44-license 45-file 46-in 47-associated 48-directories ' + '49-this 50-model 51-is 52-selected 53-to 54-emphasize 55-that ' + '56-files 57-in 58-the 59-lib 60-directory 61-are 62-designed 63-to 64-be 65-included 66-into 67-3rd 68-party 69-applications ' + '70-while 71-all 72-other 73-files 74-in 75-programs 76-tests 77-or 78-examples ' + '79-receive 80-more 81-limited 82-attention 83-and 84-support 85-for 86-such 87-scenario' + ) + list(qry1.tokens_by_line()) + assert qry1.unknowns_by_pos == {} + + # run matching + matches = idx.match(location=location) + match = matches[0] + + rule = [ + r for r in idx.rules_by_rid + if r.identifier == 'bsd-simplified_and_gpl-2.0_1.RULE' + ][0] + + expected = LicenseMatch( + matcher='2-aho', + rule=rule, + qspan=Span(0, 48), + ispan=Span(0, 48), + ) + + assert match == 
expected + + # check that query unknown by pos is the same and empty + qry2 = match.query + + # this was incorrectly returned as {15: 0, 20: 0, 21: 0, 41: 0, 43: 0} + # after querying done during matching + assert qry2.unknowns_by_pos == {}