-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathindexer.cpp
502 lines (450 loc) · 13.4 KB
/
indexer.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
#include "indexer.h"
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <fstream>
#include <cstring>
// CONSTRUCTOR
Indexer::Indexer()
{
indexer_ = NULL;
}
// DESTRUCTOR
Indexer::~Indexer()
{
if (indexer_)
delete indexer_;
}
// INSERT A KEYWORD INTO THE INDEXER
void Indexer::insertKey(const string & keyword, Document & docname)
{
indexer_ = insertKey(indexer_, keyword, docname);
}
INode * Indexer::insertKey(INode * node, const string &keyword, Document & docname)
{
//If the tree is empty, create a new node for it
if (node == NULL)
{
INode * new_node = new INode(keyword);
new_node->data()->docs(docname);
return new_node;
}
//If keyword is smaller than current node key, set it to the left
if (std::strcmp(keyword.c_str(), node->data()->word_.c_str()) < 0)
{
node->setLeft(insertKey(node->left(), keyword, docname));
return reBalance(node);
}
//If keyword is greater than current node key, set it to the right
else if (std::strcmp(keyword.c_str(), node->data()->word_.c_str()) > 0)
{
node->setRight(insertKey(node->right(), keyword, docname));
return reBalance(node);
}
//If keyword has already exist, return the node
node->data()->docs(docname);
return node;
}
// REBALANCE THE INDEXER'S STRUCTURE
INode * Indexer::reBalance(INode *node)
{
if (node == NULL)
return node;
int leftHeight = INode::getHeight(node->left());
int rightHeight = INode::getHeight(node->right());
if (abs(leftHeight - rightHeight) < 2)
return node;
// Check if we need to rotate the node right or left
bool rotateRight = leftHeight > rightHeight;
// We rotate the node right when left height is larger than right height
if (rotateRight)
{
INode * left = node->left();
INode * lChild = left->left();
INode * rChild = left->right();
//Left-left case
if (INode::getHeight(lChild) > INode::getHeight(rChild))
{
node->setLeft(rChild);
left->setRight(node);
node = left;
}
//Left-right case
else
{
node->setLeft(rChild->right());
left->setRight(rChild->left());
rChild->setLeft(left);
rChild->setRight(node);
node = rChild;
}
}
// Rotate the node left when left height is smaller than right height
else
{
INode * right = node->right();
INode * lChild = right->left();
INode * rChild = right->right();
//Right-right case
if (INode::getHeight(rChild) > INode::getHeight(lChild))
{
node->setRight(right->left());
right->setLeft(node);
node = right;
}
//Right-left case
else
{
node->setRight(lChild->left());
right->setLeft(lChild->right());
lChild->setLeft(node);
lChild->setRight(right);
node = lChild;
}
}
return node;
}
// FIND A NODE IN THE INDEXER CONTAIN PROVIDED KEYWORD
INode *Indexer::at(const string &keyword)
{
INode * ptr = indexer_;
while (ptr != NULL && keyword != ptr->data()->word())
{
if (keyword < ptr->data()->word())
ptr = ptr->left();
else
ptr = ptr->right();
}
return ptr;
}
// SET A QUERY INTO THE INDEXER
void Indexer::setQuery(string &query)
{
// Delete all trailing white spaces
string::iterator it = query.end() - 1;
for (;it != query.begin() && *it == ' '; it--);
query.resize(it + 1 - query.begin());
stringstream strm;
strm.str(query); // Init the string stream with the query string
string temp;
unsigned int i = 0;
// Flush out the last query
query_.clear();
// Change status code to default SUCCESS value
status_ = SUCCESS;
// Parse through the string stream
while (! strm.eof())
{
strm >> temp; // flow it into buffer memory
query_.push_back(temp);
if (i % 2 == 0 && (query_.at(i) == "AND" || query_.at(i) == "OR")) // if it is not operand as it should be
{
status_ = SYNTAX_ERROR; // raise an error
break; // break the process
}
else if (i % 2 == 1 && (query_.at(i) != "AND" && query_.at(i) != "OR")) // if it is not operator as it should be
{
query_.push_back(query_.at(i));
query_.at(i++) = "AND";
}
i++;
}
if (i % 2 == 0) // If the last unit is an operand
{
status_ = SYNTAX_ERROR; // raise an error
return;
}
for (i = 0; i < query_.size(); i = i + 2)
{
if (!filter(query_.at(i), true))
{
status_ = STOPWORD_WARNING; // raise an warning
if (query_.size() == 1)
{
query_.erase(query_.begin());
break;
}
else if (i == 0)
{
query_.erase(query_.begin());
query_.erase(query_.begin());
i = i - 2;
}
else if (i == query_.size() - 1)
{
query_.pop_back();
query_.pop_back();
break;
}
else
{
query_.erase(query_.begin() + i);
if (query_.at(i-1) == "AND" && query_.at(i) == "OR")
query_.erase(query_.begin() + i - 1);
else
query_.erase(query_.begin() + i);
i = i - 2;
}
}
}
}
// EXECUTE THE QUERY
void Indexer::execute()
{
if (status_ != SYNTAX_ERROR && query_.size() > 0)
{
stack<string> s_operator; // A stack used to store operator(s)
stack< vector<Document> > s_operand; // A stack used to store operand(s)
vector<Document> d1, d2;
string buffer;
// Parse through the query stack
while (query_.size() > 0)
{
// Cache the top element
buffer = query_.back();
query_.pop_back();
// If buffer contain operator AND - highest priority
if (buffer == "AND")
{
// push it into operator stack right away
s_operator.push(buffer);
}
// else, it is operator OR
else if (buffer == "OR")
{
// loop while last operator has higher priority (AND operator)
while (!s_operator.empty() && s_operator.top() == "AND")
{
// we pop the AND operator out
s_operator.pop();
d1 = s_operand.top();
s_operand.pop();
d2 = s_operand.top();
s_operand.pop();
// and do the math
result_ = Document::conjunct(d1, d2);
// push the result into the operand stack as a normal operand
s_operand.push(result_);
}
// push OR into operator stack
s_operator.push(buffer);
}
// else, it is an operand
else
{
// If the keyword is a wildcard
if (buffer.find("*") != string::npos)
// push the wildcard filter result into operand stack
s_operand.push(Indexer::match(indexer_, buffer));
// otherwise,
else
// just push it into operand stack for later process
s_operand.push((*this)[buffer]);
}
}
// Evaluation
while (!s_operator.empty())
{
// If the operator is AND
if (s_operator.top() == "AND")
{
// take out the 2 next operands in operand stack
s_operator.pop();
d1 = s_operand.top();
s_operand.pop();
d2 = s_operand.top();
s_operand.pop();
// do the conjunct operation
result_ = Document::conjunct(d1, d2);
// and then push the result back into operand stack
s_operand.push(result_);
}
// otherwise
else
{
s_operator.pop();
d1 = s_operand.top();
s_operand.pop();
d2 = s_operand.top();
s_operand.pop();
// do the disjunct operation
result_ = Document::disjunct(d1, d2);
// and then push the result back into operand stack
s_operand.push(result_);
}
}
// The last result is the last element left in operand stack
if (!s_operand.empty())
result_ = s_operand.top();
}
else
result_.clear();
}
vector<Document> Indexer::result()
{
// Sort result by score before return it out
sort(result_.begin(), result_.end(), Document::docFreqComp);
return result_;
}
// INDEX A DOCUMENT
bool Indexer::addDocument(const string &docname)
{
// Open document
ifstream docfile;
docfile.open(docname.c_str());
//Association value for keyword
Document doc;
doc.name(docname);
string keyword;
if (docfile.is_open())
{
// Check if the file is text file or not
int c;
while ( (c = docfile.get()) != EOF && c <= 127 );
if (c == EOF)
{
docfile.clear();
docfile.seekg(0, ios::beg);
while (docfile.good())
{
// Read each word one by one in the document
docfile >> keyword;
// Filter the keyword and if it's still valid, we continue indexing
if (filter(keyword, false))
this->insertKey(keyword, doc);
}
}
else
{
return false; // Indicate the file is not read
}
}
docfile.close();
return true; // Indicate successfully reading
}
// OVERLOADING OPERATOR [] FOR FAST ACCESS NODE VALUE BY KEY WORD
vector<Document> Indexer::operator[](const string &keyword)
{
vector<Document> result;
INode * keynode = this->at(keyword);
// If the keyword exists, return all documents containing it
if (keynode != NULL)
{
keynode->data()->sortDocs();
result = keynode->data()->docs();
}
// return empty result by default
return result;
}
// REMOVE UNNECESSARY CHARACTER FROM THE KEYWORD AND CHECK ITS VALIDITY
bool Indexer::isGarbage(char c)
{
return !( (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') );
}
bool Indexer::filter(string &keyword, bool isQuery)
{
// Lower all characters and remove unnecessary one
string::iterator it = keyword.begin();
while (!keyword.empty() && it < keyword.end())
{
*it = tolower(*it);
if (isQuery)
if (*it == '*')
{
it++;
continue;
}
if (isGarbage(*it))
{
keyword.erase(it);
continue;
}
it++;
}
if (keyword.size() == 0 || stopwords_.find(keyword) != stopwords_.end())
return false;
return true;
}
// RETRIEVE STOP WORDS FROM FILE
void Indexer::indexStopWords(const string &wordfile)
{
ifstream file;
file.open(wordfile.c_str());
string stopword;
if (file.is_open())
{
while (file.good())
{
// Read words one by one then insert them to the set
file >> stopword;
stopwords_.insert(stopword);
}
}
}
// PROVIDE ACCESS TO INDEXER DATA
INode *Indexer::indexer()
{
return indexer_;
}
Indexer::Status Indexer::status()
{
return status_;
}
// HANDLE THE REGEX STRING TO TURN IT INTO A SPECIFIC PATTERN OBJECT. MAKE THE TASK MORE COMFORTABLE.
vector<Document> Indexer::match(INode *node, string regex)
{
int pos; // An anchor to determine the position of '*' character
vector<string> pattern; // A minified object used to store regex string in an easy-to-handle format
while ((pos = regex.find("**")) != string::npos) // Standardize the wildcard string
{
regex.replace(pos, 2, "*");
}
while ((pos = regex.find('*')) != string::npos) // Parse through regex string and use '*' as delimeter
{
pattern.push_back(regex.substr(0, pos)); // to cut regex string into small parts used in pattern object
regex = regex.substr(pos + 1);
}
pattern.push_back(regex); // Push the last of regex string into the object to complete the pattern
return Indexer::match(node, pattern);
}
// FIND ALL INDEX WITH RIGHT FORMAT
vector<Document> Indexer::match(INode *node, vector<string> pattern)
{
vector<Document> result;
if (node == NULL) // If the node is empty
return result; // return an empty result
string word = node->data()->word(); // Cache out the INode word data value
int pos;
// Post-order traverse
result = Document::disjunct(Indexer::match(node->left(), pattern), Indexer::match(node->right(), pattern));
string temp(pattern.at(0));
if ((pos = word.find(temp)) == string::npos || pos != 0) // If the first pattern is not at the start of the word
{
return result; // return
}
else // else, the first pattern match the word
{
word = word.substr(temp.size()); // cut the first matching part out of the word
}
temp = pattern.back();
if ((pos = word.rfind(temp)) == string::npos || (unsigned) pos != word.size() - temp.size()) // If the last pattern is not at the end of the word
{
return result; // return
}
else // else, the last pattern match the word
{
word = word.substr(0, pos); // cut the last matching part out of the word
}
// Else, we move on and test the pattern in the middle of the word
for (unsigned int i = 1; i < pattern.size() - 1; i++)
{
if ((pos = word.find(pattern.at(i))) != string::npos) // If we find the pattern
word = word.substr(pos + 1); // cut the matching part out of the word
else // else,
return result; // return
}
node->data()->sortDocs();
result = Document::disjunct(result, node->data()->docs()); // append the documents containing it into the result
return result;
}