-
Notifications
You must be signed in to change notification settings - Fork 76
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Issue #31] (Team 4) Enabling positional indexing in Lucene for TEXT type #103
Changes from all commits
8412121
9d6ced4
58b4062
5ea6e24
219d0fa
1b0c1e4
0e6ec5a
e60a943
0995954
674395b
6c3ce95
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,7 @@ | |
import org.apache.lucene.document.DateTools.Resolution; | ||
import org.apache.lucene.document.Field.Store; | ||
import org.apache.lucene.index.IndexableField; | ||
import org.apache.lucene.index.IndexOptions; | ||
|
||
import edu.uci.ics.textdb.api.common.Attribute; | ||
import edu.uci.ics.textdb.api.common.FieldType; | ||
|
@@ -49,18 +50,18 @@ public static IField getField(FieldType fieldType, String fieldValue) throws Par | |
case TEXT: | ||
field = new TextField(fieldValue); | ||
break; | ||
|
||
default: | ||
break; | ||
} | ||
return field; | ||
} | ||
|
||
public static IndexableField getLuceneField(FieldType fieldType, | ||
String fieldName, Object fieldValue) { | ||
String fieldName, Object fieldValue) { | ||
IndexableField luceneField = null; | ||
switch(fieldType){ | ||
case STRING: | ||
case STRING: | ||
luceneField = new org.apache.lucene.document.StringField( | ||
fieldName, (String) fieldValue, Store.YES); | ||
break; | ||
|
@@ -78,10 +79,22 @@ public static IndexableField getLuceneField(FieldType fieldType, | |
luceneField = new org.apache.lucene.document.StringField(fieldName, dateString, Store.YES); | ||
break; | ||
case TEXT: | ||
luceneField = new org.apache.lucene.document.TextField( | ||
fieldName, (String) fieldValue, Store.YES); | ||
break; | ||
|
||
//By default we enable positional indexing in Lucene so that we can return | ||
// information about character offsets and token offsets | ||
org.apache.lucene.document.FieldType luceneFieldType = new org.apache.lucene.document.FieldType(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add comments to the codebase: "By default we enable positional indexing in Lucene so that we can return information about character offsets and token offsets." |
||
luceneFieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS ); | ||
luceneFieldType.setStored(true); | ||
luceneFieldType.setStoreTermVectors( true ); | ||
luceneFieldType.setStoreTermVectorOffsets( true ); | ||
luceneFieldType.setStoreTermVectorPayloads( true ); | ||
luceneFieldType.setStoreTermVectorPositions( true ); | ||
luceneFieldType.setTokenized( true ); | ||
|
||
luceneField = new org.apache.lucene.document.Field( | ||
fieldName,(String) fieldValue,luceneFieldType); | ||
|
||
break; | ||
|
||
} | ||
return luceneField; | ||
} | ||
|
@@ -96,10 +109,10 @@ public static ITuple getSpanTuple( List<IField> fieldList, List<Span> spanList, | |
IField[] fieldsDuplicate = fieldListDuplicate.toArray(new IField[fieldListDuplicate.size()]); | ||
return new DataTuple(spanSchema, fieldsDuplicate); | ||
} | ||
|
||
/** | ||
* | ||
* @param schema | ||
* | ||
* @param schema | ||
* @about Creating a new schema object, and adding SPAN_LIST_ATTRIBUTE to | ||
* the schema. SPAN_LIST_ATTRIBUTE is of type List | ||
*/ | ||
|
@@ -117,21 +130,25 @@ public static Schema createSpanSchema(Schema schema) { | |
|
||
/** | ||
* Tokenizes the query string using the given analyser | ||
* @param analyzer | ||
* @param luceneAnalyzer | ||
* @param query | ||
* @return ArrayList<String> list of results | ||
*/ | ||
public static ArrayList<String> tokenizeQuery(Analyzer analyzer, String query) { | ||
public static ArrayList<String> tokenizeQuery(Analyzer luceneAnalyzer, String query) { | ||
HashSet<String> resultSet = new HashSet<>(); | ||
ArrayList<String> result = new ArrayList<String>(); | ||
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query)); | ||
TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query)); | ||
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); | ||
|
||
try{ | ||
tokenStream.reset(); | ||
while (tokenStream.incrementToken()) { | ||
String term = charTermAttribute.toString(); | ||
resultSet.add(term); | ||
String token = charTermAttribute.toString(); | ||
int tokenIndex = query.toLowerCase().indexOf(token); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we do this extra step? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since tokens are converted to lower case, get the exact token from the query string. |
||
// Since tokens are converted to lower case, | ||
// get the exact token from the query string. | ||
String actualQueryToken = query.substring(tokenIndex, tokenIndex+token.length()); | ||
resultSet.add(actualQueryToken); | ||
} | ||
tokenStream.close(); | ||
} catch (Exception e) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we added one more offset, it would be good to add comments to explain that "start" and "end" are character offsets. Also add an example to explain their meaning, and explain that character offsets are for "gaps," and "tokenOffset" starts from 0 (?).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.