-
Notifications
You must be signed in to change notification settings - Fork 76
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Issue #31] (Team4) Keyword Operator #85
Changes from 9 commits
29bef31
b6affd2
5e64dce
ce33cc3
00f1b4d
4487b20
31f12ed
c103be7
f5f58f6
712bcdd
2d8afe6
18b19b6
7e3619f
17b42d6
5f1f044
ed297d7
eb74d2f
8fb8d92
db4a530
774f5da
b4ec071
7671fbf
05315cb
3b2927c
8ddf519
5ba14f6
1357ac2
9bdf5d0
43d9604
a782b1f
ea5d6c0
11bff36
c0e4ba1
365285e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
package edu.uci.ics.textdb.dataflow.common; | ||
|
||
import java.io.StringReader; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
import edu.uci.ics.textdb.api.common.Attribute; | ||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | ||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | ||
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser; | ||
import org.apache.lucene.queryparser.classic.ParseException; | ||
import org.apache.lucene.queryparser.classic.QueryParser; | ||
import org.apache.lucene.search.BooleanClause; | ||
import org.apache.lucene.search.BooleanQuery; | ||
import org.apache.lucene.search.Query; | ||
|
||
import edu.uci.ics.textdb.api.common.IField; | ||
import edu.uci.ics.textdb.api.common.IPredicate; | ||
import edu.uci.ics.textdb.api.common.ITuple; | ||
import edu.uci.ics.textdb.common.exception.DataFlowException; | ||
import edu.uci.ics.textdb.common.field.StringField; | ||
|
||
/** | ||
* @author prakul | ||
* @author akshay | ||
*/ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add high-level comments to explain the purpose of this class. |
||
public class KeywordPredicate implements IPredicate{ | ||
|
||
private final List<Attribute> attributeList; | ||
private final String[] fields; | ||
private final String query; | ||
private final Query queryObject; | ||
private ArrayList<String> tokens; | ||
private Analyzer analyzer; | ||
|
||
public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer analyzer ) throws DataFlowException{ | ||
try { | ||
this.query = query; | ||
this.attributeList = attributeList; | ||
String[] temp = new String[attributeList.size()]; | ||
|
||
for (int i=0;i< attributeList.size();i++){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add necessary spaces to "int i=0;i< attributeList.size();i++)". |
||
temp[i] = attributeList.get(i).getFieldName(); | ||
} | ||
this.fields = temp; | ||
this.tokens = queryTokenizer(analyzer, this.query); | ||
this.analyzer = analyzer; | ||
this.queryObject = createQueryObject(); | ||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
throw new DataFlowException(e.getMessage(), e); | ||
} | ||
} | ||
|
||
@Override | ||
public boolean satisfy(ITuple tuple) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a "TODO" here? |
||
return true; | ||
} | ||
|
||
|
||
|
||
private Query createQueryObject() throws ParseException { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use an example to explain the purpose of this function. |
||
BooleanQuery booleanQuery = new BooleanQuery(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My Intellij IDE shows that "BooleanQuery" is deprecated. |
||
MultiFieldQueryParser parser = new MultiFieldQueryParser(this.fields, this.analyzer); | ||
for(String searchToken: this.tokens){ | ||
Query termQuery = parser.parse(searchToken); | ||
booleanQuery.add(termQuery, BooleanClause.Occur.MUST); | ||
} | ||
return booleanQuery; | ||
} | ||
|
||
|
||
public String getQuery(){ | ||
return query; | ||
} | ||
|
||
public List<Attribute> getAttributeList() { | ||
return attributeList; | ||
} | ||
public Query getQueryObject(){return this.queryObject;} | ||
|
||
public ArrayList<String> getTokens(){return this.tokens;} | ||
|
||
public Analyzer getAnalyzer(){ | ||
return analyzer; | ||
} | ||
|
||
public ArrayList<String> queryTokenizer(Analyzer analyzer, String query) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The function name should start with a verb. Add high-level comments to explain this function. |
||
|
||
ArrayList<String> result = new ArrayList<String>(); | ||
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query)); | ||
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); | ||
|
||
try{ | ||
tokenStream.reset(); | ||
while (tokenStream.incrementToken()) { | ||
String term = charTermAttribute.toString(); | ||
result.add(term); | ||
} | ||
tokenStream.close(); | ||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
} | ||
|
||
return result; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
package edu.uci.ics.textdb.dataflow.keywordmatch; | ||
|
||
import edu.uci.ics.textdb.api.common.*; | ||
import edu.uci.ics.textdb.common.constants.SchemaConstants; | ||
import edu.uci.ics.textdb.api.common.Schema; | ||
import edu.uci.ics.textdb.common.field.Span; | ||
import edu.uci.ics.textdb.common.field.StringField; | ||
import edu.uci.ics.textdb.common.field.TextField; | ||
import edu.uci.ics.textdb.dataflow.common.KeywordPredicate; | ||
import org.apache.lucene.search.Query; | ||
import edu.uci.ics.textdb.common.field.ListField; | ||
import edu.uci.ics.textdb.common.field.DataTuple; | ||
|
||
import edu.uci.ics.textdb.api.dataflow.IOperator; | ||
import edu.uci.ics.textdb.api.dataflow.ISourceOperator; | ||
import edu.uci.ics.textdb.common.exception.DataFlowException; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
/** | ||
* @author prakul | ||
* | ||
*/ | ||
public class KeywordMatcher implements IOperator { | ||
private final KeywordPredicate predicate; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done
|
||
private ISourceOperator sourceOperator; | ||
private Query luceneQuery; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This variable ls never used? |
||
|
||
private String regex; | ||
private Pattern pattern; | ||
private ArrayList<Pattern> patternList; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "patternList" -> "tokenPatternList"? |
||
private Matcher matcher; | ||
private List<Span> spanList; | ||
private Schema schema; | ||
private Schema spanSchema; | ||
|
||
private int positionIndex; // next position in the field to be checked. | ||
private int spanIndexValue; // Starting position of the matched dictionary | ||
|
||
private String documentValue; | ||
|
||
private String fieldName; | ||
private String queryValue; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "queryValue" -> "query"? |
||
private List<Attribute> attributeList; | ||
private ArrayList<String> queryValueArray; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "queryValueArray" -> "queryTokens"? |
||
private ITuple sourceTuple; | ||
private List<IField> fieldList; | ||
private boolean foundFlag; | ||
private boolean schemaDefined; | ||
|
||
|
||
public KeywordMatcher(IPredicate predicate, ISourceOperator sourceOperator) { | ||
this.predicate = (KeywordPredicate)predicate; | ||
this.sourceOperator = sourceOperator; | ||
} | ||
|
||
@Override | ||
public void open() throws DataFlowException { | ||
try { | ||
|
||
sourceOperator.open(); | ||
queryValue = predicate.getQuery(); | ||
attributeList = predicate.getAttributeList(); | ||
queryValueArray = predicate.getTokens(); | ||
patternList = new ArrayList<Pattern>(); | ||
for(String token : queryValueArray ){ | ||
regex = "\\b" + token.toLowerCase() + "\\b"; | ||
pattern = Pattern.compile(regex); | ||
patternList.add(pattern); | ||
} | ||
|
||
|
||
positionIndex = 0; | ||
foundFlag = false; | ||
schemaDefined = false; | ||
|
||
spanList = new ArrayList<>(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a very good idea that the "spanList" object is only created once in this operator. I think other operators (from other teams) should do the same thing. I will remind them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @sandeepreddy602 Do you think your getNextTupe() can do the same optimization by creating spanList only once per operator (instead of per getNextTupe() call)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chenlica .. We are also doing the same There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great. Can you help me check if the queryrewriter is doing the same (in the master)? Other operators are not done yet. |
||
|
||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
throw new DataFlowException(e.getMessage(), e); | ||
} | ||
} | ||
|
||
@Override | ||
public ITuple getNextTuple() throws DataFlowException { | ||
try { | ||
sourceTuple = sourceOperator.getNextTuple(); | ||
if(sourceTuple == null){ | ||
return null; | ||
} | ||
if(!schemaDefined){ | ||
schemaDefined = true; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My IDE gave a warning that this "schemaDefined" is never used after the assignment. I think the comment is correct, since "schemaDefined" is defined as a local variable, and there is no loop to modify it. Is it a bug? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I moved "schemaDefined" wrongly to local scope during refactoring. Fixed it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Saw it. Agreed with the fix. |
||
fieldList = sourceTuple.getFields(); | ||
schema = sourceTuple.getSchema(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is calculating schema only once (ie assumes different tuples have same schema, which is also enforced with our LuceneSource.java design)
Getting schema in open() would require calling getNextTuple() there, which is not recommended.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code calculates schema only once (assumes same schema). For creating schema in open() function, we would have to call getNextTuple() in open to obtain a sourceTuple, which is not recommended. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Understood and agreed. Sorry I didn't see the schema is only created once (by using the "schemaDefined" flag). |
||
spanSchema = createSpanSchema(); | ||
} | ||
|
||
|
||
for(int attributeIndex = 0; attributeIndex < attributeList.size(); attributeIndex++){ | ||
IField field = sourceTuple.getField(attributeList.get(attributeIndex).getFieldName()); | ||
String fieldValue = (String) (field).getValue(); | ||
if(field instanceof StringField){ | ||
//Keyword should match fieldValue entirely | ||
|
||
if(fieldValue.equals(queryValue.toLowerCase())){ | ||
spanIndexValue = 0; | ||
positionIndex = queryValue.length(); | ||
addSpanToSpanList(fieldName, spanIndexValue, positionIndex, queryValue, fieldValue); | ||
foundFlag = true; | ||
|
||
} | ||
} | ||
else if(field instanceof TextField) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a comment to explain how we deal with a TextField. |
||
for (int iter = 0; iter < queryValueArray.size(); iter++) { | ||
String query = queryValueArray.get(iter); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "String query" -> "String token", right? |
||
Pattern p = patternList.get(iter); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "Pattern p" -> "Pattern pattern" There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
matcher = p.matcher(fieldValue.toLowerCase()); | ||
while (matcher.find(positionIndex) != false) { | ||
spanIndexValue = matcher.start(); | ||
positionIndex = spanIndexValue + query.length(); | ||
documentValue = fieldValue.substring(spanIndexValue, positionIndex); | ||
addSpanToSpanList(fieldName, spanIndexValue, positionIndex, query, documentValue); | ||
foundFlag = true; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure if the "foundFlag" is used correctly. We return this tuple only if ALL the keywords appear in a given field. Not sure if this logic is implemented correctly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For keywordMatcher following design has been followed: Tuple is being returned even if one token is being matched in text field. For string field only exact phrase is matched. Its a design decision with no correct answer. Let me know if this is correct. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This implementation is using the "OR" (disjunction) logic, i.e., as long as one token in the query string matches a keyword in a field, we create a list of spans for this field. I was actually considering using the "AND" (conjunction) logic, i.e., all the tokens in the query string have to appear in a field before we generate multiple spans for this field. Agreed that there is no correct answer. But I prefer the "AND" logic. If you agree, then a good implementation is to store all the spans for a field to a temperate spanList. Only after we find all query tokens in the field can we append these spans to the final spanList. @sandeepreddy602 : your thought on this subject? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @sandeepreddy602 : take a look at this discussion. I believe we should implement the "AND" logic not "OR" logic. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chenlica .. I was of the opinion that, we should follow "OR" logic for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To finish this PR quickly, let's keep the current implementation ("OR" logic). We can refactor it later to make it configurable. @prakul : please take care of the new comments by @sandeepreddy602 . Please also add a comment to the code to say that we are doing the "OR" logic for now. After that, please do the merge. Thank you! |
||
|
||
} | ||
} | ||
} | ||
positionIndex = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove this "positionIndex = 0" statement since it's never needed? We init it at the beginning of each "for" iteration? |
||
} | ||
|
||
//If all the 'attributes to be searched' have been processed return the result tuple with span info | ||
if (foundFlag){ | ||
foundFlag = false; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove "foundFlag = false" since it's no longer needed. |
||
positionIndex = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove "positionIndex = 0" since it's no longer used. |
||
return getSpanTuple(); | ||
} | ||
//Search next document if the required predicate did not match previous document | ||
else if(sourceTuple != null) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When we reach here, "sourceTuple" cannot be null, right? |
||
positionIndex = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove "positionIndex = 0" since it's no longer used. |
||
spanList.clear(); | ||
|
||
return getNextTuple(); | ||
|
||
} | ||
|
||
return null; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we change the line |
||
|
||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
throw new DataFlowException(e.getMessage(), e); | ||
} | ||
|
||
} | ||
|
||
private ITuple getSpanTuple() { | ||
IField spanListField = new ListField<Span>(new ArrayList<>(spanList)); | ||
List<IField> fieldListDuplicate = new ArrayList<>(fieldList); | ||
fieldListDuplicate.add(spanListField); | ||
|
||
IField[] fieldsDuplicate = fieldListDuplicate.toArray(new IField[fieldListDuplicate.size()]); | ||
return new DataTuple(spanSchema, fieldsDuplicate); | ||
} | ||
|
||
private Schema createSpanSchema() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The Regex team is also writing a similar function. See https://github.com/TextDB/textdb/pull/84/files @laisycs @sandeepreddy602 : can you chime in to see how to avoid duplicate efforts? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chenlica .. Will add these methods to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @sandeepreddy602 : you can either add the methods in this PR, or use another PR and include the PR here. For the latter, @prakul will merge the new master and use these new methods. Either way is fine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @chenlica .. Will handle these as part of this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
List<Attribute> dataTupleAttributes = schema.getAttributes(); | ||
Attribute[] spanAttributes = new Attribute[dataTupleAttributes.size() + 1]; | ||
for (int count = 0; count < spanAttributes.length - 1; count++) { | ||
spanAttributes[count] = dataTupleAttributes.get(count); | ||
} | ||
spanAttributes[spanAttributes.length - 1] = SchemaConstants.SPAN_LIST_ATTRIBUTE; | ||
Schema spanSchema = new Schema(spanAttributes); | ||
return spanSchema; | ||
} | ||
|
||
private void addSpanToSpanList(String fieldName, int start, int end, String key, String value) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didnt find it. anyways Its a small function.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is there in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. Not a big deal. |
||
Span span = new Span(fieldName, start, end, key, value); | ||
spanList.add(span); | ||
} | ||
|
||
|
||
@Override | ||
public void close() throws DataFlowException { | ||
try { | ||
sourceOperator.close(); | ||
} catch (Exception e) { | ||
e.printStackTrace(); | ||
throw new DataFlowException(e.getMessage(), e); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,4 +45,6 @@ public void close() throws DataFlowException { | |
throw new DataFlowException(e.getMessage(), e); | ||
} | ||
} | ||
|
||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove unused packages. Hopefully you can see them easily in your IDE.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks Professor @chenlica . Will keep in mind.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I still see unused packages on the "import" list.