Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Issue #31] (Team 4) Enabling positional indexing in Lucene for TEXT type #103

Merged
merged 11 commits into from
May 18, 2016
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public boolean equals(Object obj) {
if (list == null) {
if (other.list != null)
return false;
// NOTE(review): '&' is the non-short-circuiting boolean operator — use '&&' so the
// second containsAll() is skipped when the first is already false.
// Also note: mutual containsAll() compares the lists as SETS — it ignores element
// cardinality (duplicates) and order, unlike List.equals — confirm that is intended.
} else if ( !(list.containsAll(other.list) & other.list.containsAll(list)))
return false;
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,41 @@
public class Span {
//The name of the field (in the tuple) where this span is present
private String fieldName;
//The start of the span
//The start position of the span, which is the offset of the gap before the first character of the span.
private int start;
//The end of the span
//The end position of the span, which is the offset of the gap after the last character of the span.
private int end;
//The key we are searching for eg: regex
private String key;
//The value matching the key
private String value;


public Span(String fieldName, int start, int end, String key, String value) {
// The token position of the span, starting from 0.
private int tokenOffset;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we added one more offset, it would be good to add comments explaining that "start" and "end" are character offsets. Also add an example to illustrate their meaning, and explain that character offsets refer to "gaps," and that "tokenOffset" starts from 0 (?).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


/*
Example:
Value = "The quick brown fox jumps over the lazy dog"
The Span for "brown" would be:
start = 10 : index of the character 'b'
end = 15 : index of the character 'n' + 1, equivalently start + length
Both expressions yield the same value.
tokenOffset = 2 : position of the word 'brown'
*/

public static int INVALID_TOKEN_OFFSET = -1;

/**
 * Constructs a Span over a character range of the given field, without token information.
 *
 * @param fieldName name of the field (in the tuple) where this span is present
 * @param start     character offset of the gap before the first character of the span
 * @param end       character offset of the gap after the last character (start + length)
 * @param key       the key being searched for, e.g. a regex
 * @param value     the matched text for the key
 */
public Span(String fieldName, int start, int end, String key, String value){
this.fieldName = fieldName;
this.start = start;
this.end = end;
this.key = key;
this.value = value;
// No token position is known for this constructor; mark it invalid.
this.tokenOffset = INVALID_TOKEN_OFFSET;
}

/**
 * Constructs a Span that additionally records the token (word) position.
 *
 * @param tokenOffset 0-based position of the matched token within the field's token
 *                    stream; pass INVALID_TOKEN_OFFSET when unknown
 */
public Span(String fieldName, int start, int end, String key, String value, int tokenOffset) {
this(fieldName, start, end, key, value);
this.tokenOffset = tokenOffset;
}

public String getFieldName() {
Expand All @@ -41,6 +60,8 @@ public int getEnd() {
return end;
}

public int getTokenOffset(){return tokenOffset;}

@Override
public int hashCode() {
final int prime = 31;
Expand All @@ -51,6 +72,7 @@ public int hashCode() {
result = prime * result + ((key == null) ? 0 : key.hashCode());
result = prime * result + start;
result = prime * result + ((value == null) ? 0 : value.hashCode());
result = prime * result + tokenOffset;
return result;
}

Expand Down Expand Up @@ -87,7 +109,10 @@ public boolean equals(Object obj) {
return false;
} else if (!value.equals(other.value))
return false;


if(tokenOffset!= other.tokenOffset)
return false;

return true;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexOptions;

import edu.uci.ics.textdb.api.common.Attribute;
import edu.uci.ics.textdb.api.common.FieldType;
Expand Down Expand Up @@ -49,18 +50,18 @@ public static IField getField(FieldType fieldType, String fieldValue) throws Par
case TEXT:
field = new TextField(fieldValue);
break;

default:
break;
}
return field;
}

public static IndexableField getLuceneField(FieldType fieldType,
String fieldName, Object fieldValue) {
String fieldName, Object fieldValue) {
IndexableField luceneField = null;
switch(fieldType){
case STRING:
case STRING:
luceneField = new org.apache.lucene.document.StringField(
fieldName, (String) fieldValue, Store.YES);
break;
Expand All @@ -78,10 +79,22 @@ public static IndexableField getLuceneField(FieldType fieldType,
luceneField = new org.apache.lucene.document.StringField(fieldName, dateString, Store.YES);
break;
case TEXT:
luceneField = new org.apache.lucene.document.TextField(
fieldName, (String) fieldValue, Store.YES);
break;

//By default we enable positional indexing in Lucene so that we can return
// information about character offsets and token offsets
org.apache.lucene.document.FieldType luceneFieldType = new org.apache.lucene.document.FieldType();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add comments to the codebase: "By default we enable positional indexing in Lucene so that we can return information about character offsets and token offsets."

luceneFieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS );
luceneFieldType.setStored(true);
luceneFieldType.setStoreTermVectors( true );
luceneFieldType.setStoreTermVectorOffsets( true );
luceneFieldType.setStoreTermVectorPayloads( true );
luceneFieldType.setStoreTermVectorPositions( true );
luceneFieldType.setTokenized( true );

luceneField = new org.apache.lucene.document.Field(
fieldName,(String) fieldValue,luceneFieldType);

break;

}
return luceneField;
}
Expand All @@ -96,10 +109,10 @@ public static ITuple getSpanTuple( List<IField> fieldList, List<Span> spanList,
IField[] fieldsDuplicate = fieldListDuplicate.toArray(new IField[fieldListDuplicate.size()]);
return new DataTuple(spanSchema, fieldsDuplicate);
}

/**
*
* @param schema
*
* @param schema
* @about Creating a new schema object, and adding SPAN_LIST_ATTRIBUTE to
* the schema. SPAN_LIST_ATTRIBUTE is of type List
*/
Expand All @@ -117,21 +130,25 @@ public static Schema createSpanSchema(Schema schema) {

/**
* Tokenizes the query string using the given analyser
* @param analyzer
* @param luceneAnalyzer
* @param query
* @return ArrayList<String> list of results
*/
public static ArrayList<String> tokenizeQuery(Analyzer analyzer, String query) {
public static ArrayList<String> tokenizeQuery(Analyzer luceneAnalyzer, String query) {
HashSet<String> resultSet = new HashSet<>();
ArrayList<String> result = new ArrayList<String>();
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query));
TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

try{
tokenStream.reset();
while (tokenStream.incrementToken()) {
String term = charTermAttribute.toString();
resultSet.add(term);
String token = charTermAttribute.toString();
int tokenIndex = query.toLowerCase().indexOf(token);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we do this extra substring() step? Add comments here and to the codebase.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since tokens are converted to lower case, get the exact token from the query string.

// Since tokens are converted to lower case,
// get the exact token from the query string.
String actualQueryToken = query.substring(tokenIndex, tokenIndex+token.length());
resultSet.add(actualQueryToken);
}
tokenStream.close();
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ public class DictionaryPredicate implements IPredicate {
New and York; if searched in String field we search for Exact string.
*/

public DictionaryPredicate(IDictionary dictionary, Analyzer analyzer, List<Attribute> attributeList,
public DictionaryPredicate(IDictionary dictionary, Analyzer luceneAnalyzer, List<Attribute> attributeList,
SourceOperatorType srcOpType, IDataStore dataStore) {

this.dictionary = dictionary;
this.luceneAnalyzer = analyzer;
this.luceneAnalyzer = luceneAnalyzer;
this.attributeList = attributeList;
this.srcOpType = srcOpType;
this.dataStore = dataStore;
Expand Down Expand Up @@ -72,7 +72,7 @@ public Analyzer getAnalyzer() {
public IOperator getScanSourceOperator() throws ParseException, DataFlowException {
QueryParser luceneQueryParser = new QueryParser(attributeList.get(0).getFieldName(), luceneAnalyzer);
Query luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY);
IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery);
IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,luceneAnalyzer,attributeList);
IDataReader dataReader = new DataReader(dataReaderPredicate);

IOperator operator = new ScanBasedSourceOperator(dataReader);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,18 @@ public class KeywordPredicate implements IPredicate{
private final String query;
private final Query luceneQuery;
private ArrayList<String> tokens;
private Analyzer analyzer;
private Analyzer luceneAnalyzer;
private IDataStore dataStore;

/*
query refers to string of keywords to search for.
For Ex. New york if searched in TextField, we would consider both tokens
New and York; if searched in String field we search for Exact string.
*/
public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer analyzer,IDataStore dataStore ) throws DataFlowException{
public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer luceneAnalyzer, IDataStore dataStore ) throws DataFlowException{
try {
this.query = query;
this.tokens = Utils.tokenizeQuery(analyzer, query);
this.tokens = Utils.tokenizeQuery(luceneAnalyzer, query);
this.attributeList = attributeList;
this.dataStore = dataStore;
String[] temp = new String[attributeList.size()];
Expand All @@ -54,7 +54,7 @@ public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer an
temp[i] = attributeList.get(i).getFieldName();
}
this.fields = temp;
this.analyzer = analyzer;
this.luceneAnalyzer = luceneAnalyzer;
this.luceneQuery = createLuceneQueryObject();
} catch (Exception e) {
e.printStackTrace();
Expand Down Expand Up @@ -105,7 +105,7 @@ and generate boolean query (Textfield is Case Insensitive)
*/
String[] remainingTextFields = (String[]) textFieldList.toArray(new String[0]);
BooleanQuery queryOnTextFields = new BooleanQuery();
MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, analyzer);
MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, luceneAnalyzer);

for(String searchToken : this.tokens){
Query termQuery = parser.parse(searchToken);
Expand All @@ -126,16 +126,18 @@ public String getQuery(){
public List<Attribute> getAttributeList() {
return attributeList;
}

public Query getQueryObject(){return this.luceneQuery;}

public ArrayList<String> getTokens(){return this.tokens;}

public Analyzer getAnalyzer(){
return analyzer;
/**
 * Returns the Lucene analyzer used to tokenize the keyword query.
 */
public Analyzer getLuceneAnalyzer() {
    return this.luceneAnalyzer;
}

public DataReaderPredicate getDataReaderPredicate() {
DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery);
DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery,
this.query, this.luceneAnalyzer, this.attributeList);
return dataReaderPredicate;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ public class RegexPredicate implements IPredicate {

private String regex;
private List<String> fieldNameList;

private List<Attribute> attributeList;
private Analyzer luceneAnalyzer;
private IDataStore dataStore;

public RegexPredicate(String regex, List<Attribute> attributeList, Analyzer analyzer, IDataStore dataStore) {
this.regex = regex;
this.luceneAnalyzer = analyzer;
this.dataStore = dataStore;
this.attributeList = attributeList;
this.fieldNameList = attributeList.stream()
.filter(attr -> (attr.getFieldType() == FieldType.TEXT || attr.getFieldType() == FieldType.STRING))
.map(attr -> attr.getFieldName()).collect(Collectors.toList());
Expand All @@ -52,4 +53,8 @@ public List<String> getFieldNameList() {
return this.fieldNameList;
}

/**
 * Returns the attributes this regex predicate is applied to.
 */
public List<Attribute> getAttributeList() {
    return this.attributeList;
}

}
Loading