-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIRsys.py
125 lines (109 loc) · 4.61 KB
/
IRsys.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 9 09:07:57 2018
@author: tpijkeren
"""
from whoosh import index
from whoosh.fields import *
from whoosh.query import *
import os.path
from os.path import isfile, join
from os import listdir
from whoosh.qparser import QueryParser
from bs4 import BeautifulSoup
import codecs
import frogress
import re
from whoosh import scoring
from whoosh.lang.porter import stem
#dingen van Theo
#datapath = '/mnt/HDD/latimes/'
#datapath = 'testdata/'
#ding van Kimberly
datapath = "/Users/kimberlystoutjesdijk/Desktop/index/"
laptopVanKimberlyDoetMoeilijk = True
def get_schema():
return Schema(docno=ID(unique=True, stored=True),
headline=TEXT(stored=True), path=ID(stored=True), content=TEXT)
def striptags(data):
p = re.compile(r'<.*?>')
return p.sub('', data)
def add_doc(writer, path):
infile = open(path,"r").read()
infile = '<root>'+infile+'</root>'
if laptopVanKimberlyDoetMoeilijk:
soup = BeautifulSoup(infile, 'html.parser')
docs = soup.find_all('doc')
for doc in docs:
docno = unicode(striptags(str(doc.docno)), "utf-8")
headline = unicode(striptags(str(doc.headline)),"utf-8") # yes, this works, please dont change
text = unicode(striptags(str(doc.text)),"utf-8")
writer.add_document(docno=docno, headline=headline, path=path, content=text)
else:
soup = BeautifulSoup(infile, 'xml')
docs = soup.find_all('DOC')
for doc in docs:
docno = unicode(striptags(str(doc.DOCNO)), "utf-8")
headline = unicode(striptags(str(doc.HEADLINE)), "utf-8") # yes, this works, please dont change
text = unicode(striptags(str(doc.TEXT)), "utf-8")
writer.add_document(docno=docno, headline=headline, path=path, content=text)
def clean_index(dirname):
# Always create the index from scratch
if not os.path.exists(dirname):
os.mkdir(dirname)
print(dirname, get_schema())
ix = index.create_in(dirname, get_schema())
writer = ix.writer()
doclist = [f for f in listdir(datapath) if isfile(join(datapath, f))]
# Assume we have a function that gathers the filenames of the
# documents to be indexed
#for path in my_docs():
l = len(doclist)
#for i, filename in enumerate(doclist):
for filename in frogress.bar(doclist):
add_doc(writer, u""+datapath + filename)
writer.commit()
#==============================================================================
# schema = Schema(title=TEXT(stored=True), content=TEXT) # stored shows in results
# ix = index.create_in("index", schema)
# #ix = open_dir("index")
# writer = ix.writer()
# writer.add_document(title=u"My document", content=u"This is my document!")
# writer.add_document(title=u"Second try", content=u"This is the second example.")
# writer.add_document(title=u"Third time's the charm", content=u"Examples are many.")
# writer.commit()
#==============================================================================
#clean_index("latimes_indexed")
ix = index.open_dir("latimes_indexed")
# searcher = ix.searcher()
# parser = QueryParser("content", ix.schema)
# myquery = parser.parse(u"plants")
# results = searcher.search(myquery)
#
# print(len(results))
# for result in results:
# print(result)
# searcher.close()
#scoring.BM25F(B=0.75, K1=1.5) -> nog uitzoeken wat die parameters precies doen, in de documentation staat alleen 'see literature'
#scoring.TF_IDF()
#scoring.Frequency
queryTest = "plant"
with ix.searcher(weighting = scoring.Frequency) as searcher:
parser = QueryParser("content", ix.schema)
myquery = parser.parse(u"" + queryTest)
results = searcher.search(myquery)
print(len(results))
for result in results:
print(result)
#The results file has the format: query_id, iter, docno, rank, sim, run_id delimited by spaces.
# Query id is the query number (e.g. 136.6 or 1894, depending on the evaluation year).
# The iter constant, 0, is required but ignored by trec_eval.
# The Document numbers are string values like FR940104-0-00001 (found between <DOCNO> tags in the documents).
# The Similarity (sim) is a float value.
# Rank is an integer from 0 to 1000, which is required but ignored by the program.
# Runid is a string which gets printed out with the output.
for i in range (len(results)):
with open("results.txt", "a") as f:
f.write("query id hier " + " " + str(0) + " " + str(results[i].get("docid")) + " " + str(10-i) + " " + str(0) + " " + "bla" + "\n")
#https://whoosh.readthedocs.io/en/latest/stemming.html
#^ voor stemming, nog checken of hij dit ook doet voor het indexen