-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproximity_parser.py
107 lines (80 loc) · 2.73 KB
/
proximity_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -----------------------------------------------------------
# This module parses a binary proximity query.
#
# The steps performed are summarized below:
# (1) Load inverted_index from file
# (2) Prompt user for query and parse it
# (3) Perform boolean retrieval to obtain relevant doc_ids
# (4) Display doc_ids to user
#
#
# (C) 2020 Muhammad Bilal Akmal, 17K-3669
# -----------------------------------------------------------
import os
import re

import filing
import token_normalizer
def _query_to_terms(query):
    '''
    Convert a proximity `query` into its two terms and the distance.

    The first run of digits found anywhere in `query` is read as the
    distance. The query is split on whitespace, the tokens are passed
    through `token_normalizer.normalize_tokens`, and the first two
    resulting terms are used.

    Returns a tuple `(term1, term2, proximity)` where `proximity` is the
    parsed number plus one.

    Raises `ValueError` if `query` contains no number or does not yield
    at least two terms.
    '''
    # Validate the distance first so a malformed query fails with a clear
    # message instead of an AttributeError on a `None` match object.
    match = re.search(r'\d+', query)
    if match is None:
        raise ValueError(
            'proximity query must contain a numeric distance: %r' % (query,)
        )
    # NOTE(review): the +1 presumably turns "k words in between" into a
    # difference of positions -- confirm against how the index numbers
    # positions.
    proximity = int(match.group(0)) + 1

    terms = token_normalizer.normalize_tokens(query.split())
    try:
        return (terms[0], terms[1], proximity)
    except IndexError:
        raise ValueError(
            'proximity query must contain two terms: %r' % (query,)
        ) from None
def _is_proximity_in_doc(doc, term1, term2, proximity, inverted_index):
    '''
    Return `True` if `term1` and `term2` occur exactly `proximity`
    positions apart, in either order, somewhere in `doc`.
    `False` otherwise.

    Positions are read from `inverted_index[term][doc]` for each term.
    The lookups are plain subscripts, so with a dict-based index a term
    or doc that is absent raises `KeyError`.
    '''
    # An absolute difference is never negative, so a negative distance
    # cannot match.
    if proximity < 0:
        return False

    positions1 = inverted_index[term1][doc]
    # A set gives constant-time lookups, so each position of term1 needs
    # two membership tests rather than a scan over all positions of term2.
    # Assumes positions are hashable numbers -- TODO confirm.
    positions2 = set(inverted_index[term2][doc])
    return any(
        position1 + proximity in positions2
        or position1 - proximity in positions2
        for position1 in positions1
    )
def _evaluate_proximity_query(term1, term2, proximity, inverted_index):
    '''
    Return a set containing the doc_ids relevant to the query.

    A doc_id is relevant when it appears under both `term1` and `term2`
    in `inverted_index` and `_is_proximity_in_doc` reports a match for
    it. An empty set is returned when either term is not indexed or when
    no document contains both terms.
    '''
    # Test membership on the index itself; copying every indexed term
    # into a new set just to check two keys is wasted work.
    if term1 not in inverted_index or term2 not in inverted_index:
        return set()

    postings1 = inverted_index[term1]
    postings2 = inverted_index[term2]
    # Only documents that contain both terms are checked for proximity.
    return {
        doc_id
        for doc_id in postings1
        if doc_id in postings2
        and _is_proximity_in_doc(
            doc_id, term1, term2, proximity, inverted_index
        )
    }
def retreive_documents(query):
    '''
    Retreive documents relevant to the proximity `query`.

    Loads the inverted index and the doc_ids lookup from the
    `resources` directory via `filing.load_python_object`, parses
    `query`, and evaluates it against the index.

    Returns a tuple `(result, documents)`: `result` is the set of
    matching doc_ids and `documents` is a list holding the `doc_ids`
    lookup entry for each of them (presumably the document filenames --
    confirm against how `resources/doc_ids` is built).
    Returns `None` if no relevant documents are found.
    '''
    # Build the paths with os.path.join so they resolve on every
    # platform; a literal backslash is only a separator on Windows.
    inverted_index = filing.load_python_object(
        os.path.join('resources', 'inverted_index')
    )
    doc_ids = filing.load_python_object(
        os.path.join('resources', 'doc_ids')
    )

    term1, term2, proximity = _query_to_terms(query)
    result = _evaluate_proximity_query(term1, term2, proximity, inverted_index)

    if not result:
        return None

    documents = [doc_ids[doc_id] for doc_id in result]
    return (result, documents)
if __name__ == '__main__':
    # Ask for proximity query
    query = input('Enter a proximity query: ')

    result = retreive_documents(query)

    # `retreive_documents` returns None when nothing matched; compare by
    # identity, which is the reliable test for the None singleton.
    if result is None:
        print('No relevant speeches.')
    else:
        print(result)