Texera · prakul · May 6, 2016 · Apr 24, 2016 · Apr 24, 2016 · Apr 24, 2016
diff --git a/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java b/textdb/textdb-common/src/main/java/edu/uci/ics/textdb/common/utils/Utils.java
@@ -1,19 +1,32 @@
 package edu.uci.ics.textdb.common.utils;
 
+import java.io.StringReader;
 import java.text.ParseException;
+import java.util.ArrayList;
 import java.util.Date;
+import java.util.HashSet;
+import java.util.List;
 
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.DateTools.Resolution;
 import org.apache.lucene.document.Field.Store;
-import org.apache.lucene.document.IntField;
 import org.apache.lucene.index.IndexableField;
 
+import edu.uci.ics.textdb.api.common.Attribute;
 import edu.uci.ics.textdb.api.common.FieldType;
 import edu.uci.ics.textdb.api.common.IField;
+import edu.uci.ics.textdb.api.common.ITuple;
+import edu.uci.ics.textdb.api.common.Schema;
+import edu.uci.ics.textdb.common.constants.SchemaConstants;
+import edu.uci.ics.textdb.common.field.DataTuple;
 import edu.uci.ics.textdb.common.field.DateField;
 import edu.uci.ics.textdb.common.field.DoubleField;
 import edu.uci.ics.textdb.common.field.IntegerField;
+import edu.uci.ics.textdb.common.field.ListField;
+import edu.uci.ics.textdb.common.field.Span;
 import edu.uci.ics.textdb.common.field.StringField;
 import edu.uci.ics.textdb.common.field.TextField;
 
@@ -72,4 +85,60 @@ public static IndexableField getLuceneField(FieldType fieldType,
         }
         return luceneField;
     }
+    /**
+     * Builds a span tuple from the fields of a data tuple and a list of spans.
+     * Neither input list is modified: the fields are copied into a fresh array
+     * and the spans are copied into a new list that is wrapped in a ListField
+     * and placed after the last copied field.
+     *
+     * @param fieldList  fields of the original tuple
+     * @param spanList   spans to attach as the trailing field
+     * @param spanSchema schema passed to the new DataTuple
+     * @return a new DataTuple holding the copied fields followed by the span list field
+     */
+    public static ITuple getSpanTuple( List<IField> fieldList, List<Span> spanList, Schema spanSchema) {
+        // Reserve one extra slot at the end for the span list field;
+        // toArray fills the leading slots with the existing fields.
+        IField[] tupleFields = fieldList.toArray(new IField[fieldList.size() + 1]);
+
+        List<Span> spanSnapshot = new ArrayList<>(spanList);
+        tupleFields[tupleFields.length - 1] = new ListField<Span>(spanSnapshot);
+
+        return new DataTuple(spanSchema, tupleFields);
+    }
+
+    /**
+     * Derives the schema of a span tuple from the schema of a data tuple.
+     *
+     * @param schema schema whose attributes are copied, in order, into the result
+     * @return a new Schema containing every attribute of {@code schema}
+     *         followed by SchemaConstants.SPAN_LIST_ATTRIBUTE
+     */
+    public static Schema createSpanSchema(Schema schema) {
+        // Work on a copy so the list returned by the input schema is never touched.
+        List<Attribute> spanAttributeList = new ArrayList<>(schema.getAttributes());
+        spanAttributeList.add(SchemaConstants.SPAN_LIST_ATTRIBUTE);
+
+        Attribute[] spanAttributes = spanAttributeList.toArray(new Attribute[spanAttributeList.size()]);
+        return new Schema(spanAttributes);
+    }
+
+    /**
+     * Tokenizes the query string using the given analyzer and returns the
+     * distinct terms in the order in which they first appear.
+     *
+     * Tokenization is best effort: if the token stream fails part-way, the
+     * stack trace is printed and the terms collected so far are returned.
+     *
+     * @param analyzer analyzer used to split {@code query} into terms
+     * @param query    text to tokenize
+     * @return list of distinct terms; empty when no term was produced
+     */
+    public static ArrayList<String> tokenizeQuery(Analyzer analyzer, String query) {
+        HashSet<String> seenTerms = new HashSet<>();
+        ArrayList<String> result = new ArrayList<String>();
+        TokenStream tokenStream  = analyzer.tokenStream(null, new StringReader(query));
+        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
+
+        // try-with-resources guarantees close() even when reset() or
+        // incrementToken() throws, so the stream is never left open.
+        try (TokenStream stream = tokenStream) {
+            stream.reset();
+            while (stream.incrementToken()) {
+                String term = charTermAttribute.toString();
+                // add() returns false for a duplicate, so each term is kept
+                // once, at the position of its first occurrence.
+                if (seenTerms.add(term)) {
+                    result.add(term);
+                }
+            }
+            // Lucene's consumer workflow: signal end-of-stream before close().
+            stream.end();
+        } catch (Exception e) {
+            // Deliberately best effort: report the failure, keep partial terms.
+            e.printStackTrace();
+        }
+
+        return result;
+    }
 }
diff --git a/...db/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java b/...db/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
@@ -0,0 +1,97 @@
+package edu.uci.ics.textdb.dataflow.common;
+
+import java.util.ArrayList;
+import java.util.List;
+import edu.uci.ics.textdb.api.common.Attribute;
+import edu.uci.ics.textdb.common.utils.Utils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+import edu.uci.ics.textdb.api.common.IPredicate;
+import edu.uci.ics.textdb.api.common.ITuple;
+import edu.uci.ics.textdb.common.exception.DataFlowException;
+
+/**
+ * Predicate for the Keyword Matcher: it tokenizes a query string and builds
+ * the Lucene query that searches the resulting tokens in a list of attributes.
+ *
+ * @author prakul
+ */
+public class KeywordPredicate implements IPredicate{
+
+    private final List<Attribute> attributeList;
+    private final String[] fields;
+    private final String query;
+    private final Query queryObject;
+    private ArrayList<String> tokens;
+    private Analyzer analyzer;
+
+    /**
+     * Creates the predicate and eagerly builds its Lucene query.
+     *
+     * @param query         keyword query; it is tokenized with {@code analyzer}
+     * @param attributeList attributes whose field names are searched
+     * @param analyzer      analyzer used for tokenizing and for query parsing
+     * @throws DataFlowException wrapping any exception raised during construction
+     */
+    public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer analyzer ) throws DataFlowException{
+        try {
+            this.query = query;
+            this.attributeList = attributeList;
+
+            // Collect the field name of every attribute, in list order.
+            String[] fieldNames = new String[attributeList.size()];
+            int position = 0;
+            for (Attribute attribute : attributeList) {
+                fieldNames[position++] = attribute.getFieldName();
+            }
+            this.fields = fieldNames;
+
+            this.tokens = Utils.tokenizeQuery(analyzer, this.query);
+            this.analyzer = analyzer;
+            // Must come last: it reads fields, analyzer and tokens.
+            this.queryObject = createQueryObject();
+        } catch (Exception failure) {
+            failure.printStackTrace();
+            throw new DataFlowException(failure.getMessage(), failure);
+        }
+    }
+
+    /**
+     * Required by the IPredicate interface; it is not really used.
+     *
+     * @param tuple ignored
+     * @return always {@code true}
+     */
+    @Override
+    public boolean satisfy(ITuple tuple) {
+        return true;
+    }
+
+    /**
+     * Creates a Query object as a boolean query over all attributes: each
+     * token is parsed by a MultiFieldQueryParser configured with every field
+     * name, and the resulting per-token query is added as a MUST clause.
+     * Example: given the attributes (Description, Last_name) and the query
+     * string "lin", the single token "lin" is parsed against both fields.
+     *
+     * TODO #88:BooleanQuery() is deprecated. In future a better solution could be worked out in Query builder layer
+     *
+     * @return boolean query combining one MUST clause per token
+     * @throws ParseException if the parser rejects a token
+     */
+    private Query createQueryObject() throws ParseException {
+        MultiFieldQueryParser multiFieldParser = new MultiFieldQueryParser(this.fields, this.analyzer);
+        BooleanQuery conjunction = new BooleanQuery();
+        for (String token : this.tokens) {
+            conjunction.add(multiFieldParser.parse(token), BooleanClause.Occur.MUST);
+        }
+        return conjunction;
+    }
+
+    /** @return the query string passed to the constructor */
+    public String getQuery(){
+        return query;
+    }
+
+    /** @return the attribute list passed to the constructor */
+    public List<Attribute> getAttributeList() {
+        return attributeList;
+    }
+
+    /** @return the Lucene query built during construction */
+    public Query getQueryObject(){
+        return this.queryObject;
+    }
+
+    /** @return the tokens produced from the query string */
+    public ArrayList<String> getTokens(){
+        return this.tokens;
+    }
+
+    /** @return the analyzer passed to the constructor */
+    public Analyzer getAnalyzer(){
+        return analyzer;
+    }
+
+}
diff --git a/...taflow/src/main/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcher.java b/...taflow/src/main/java/edu/uci/ics/textdb/dataflow/dictionarymatcher/DictionaryMatcher.java
@@ -12,13 +12,11 @@
 import edu.uci.ics.textdb.api.common.ITuple;
 import edu.uci.ics.textdb.api.common.Schema;
 import edu.uci.ics.textdb.api.dataflow.IOperator;
-import edu.uci.ics.textdb.common.constants.SchemaConstants;
 import edu.uci.ics.textdb.common.exception.DataFlowException;
-import edu.uci.ics.textdb.common.field.DataTuple;
-import edu.uci.ics.textdb.common.field.ListField;
 import edu.uci.ics.textdb.common.field.Span;
 import edu.uci.ics.textdb.common.field.StringField;
 import edu.uci.ics.textdb.common.field.TextField;
+import edu.uci.ics.textdb.common.utils.Utils;
 
 /**
  * @author Sudeep [inkudo]
@@ -36,7 +34,6 @@ public class DictionaryMatcher implements IOperator {
     private String spanFieldName;
     private ITuple dataTuple;
     private List<IField> fields;
-    private Schema schema;
     private Schema spanSchema;
 
     private String regex;
@@ -79,8 +76,9 @@ public void open() throws Exception {
 
             dataTuple = operator.getNextTuple();
             fields = dataTuple.getFields();
-            schema = dataTuple.getSchema();
-            spanSchema = createSpanSchema();
+            if(spanSchema == null){
+                spanSchema = Utils.createSpanSchema(dataTuple.getSchema());
+            }
 
             spanList = new ArrayList<>();
             isPresent = false;
@@ -91,22 +89,6 @@ public void open() throws Exception {
         }
     }
 
-    /**
-     * 
-     * @about Creating a new schema object, and adding SPAN_LIST_ATTRIBUTE to
-     *        the schema. SPAN_LIST_ATTRIBUTE is of type List
-     */
-    private Schema createSpanSchema() {
-        List<Attribute> dataTupleAttributes = schema.getAttributes();
-        Attribute[] spanAttributes = new Attribute[dataTupleAttributes.size() + 1];
-        for (int count = 0; count < spanAttributes.length - 1; count++) {
-            spanAttributes[count] = dataTupleAttributes.get(count);
-        }
-        spanAttributes[spanAttributes.length - 1] = SchemaConstants.SPAN_LIST_ATTRIBUTE;
-        Schema spanSchema = new Schema(spanAttributes);
-        return spanSchema;
-    }
-
     /**
      * @about Gets next matched tuple. Returns a new span tuple including the
      *        span results. Performs a scan based search, gets the dictionary
@@ -168,7 +150,7 @@ public ITuple getNextTuple() throws Exception {
         } else if (attributeIndex == searchInAttributes.size() && isPresent) {
             isPresent = false;
             positionIndex = 0;
-            return getSpanTuple();
+            return Utils.getSpanTuple(fields, spanList, spanSchema);
 
         } else if ((dataTuple = operator.getNextTuple()) != null) {
             // Get the next document
@@ -177,8 +159,6 @@ public ITuple getNextTuple() throws Exception {
             spanList.clear();
 
             fields = dataTuple.getFields();
-            schema = dataTuple.getSchema();
-            spanSchema = createSpanSchema();
             return getNextTuple();
 
         } else if ((dictionaryValue = dictionary.getNextValue()) != null) {
@@ -197,7 +177,6 @@ public ITuple getNextTuple() throws Exception {
 
             dataTuple = operator.getNextTuple();
             fields = dataTuple.getFields();
-            schema = dataTuple.getSchema();
             return getNextTuple();
         }
 
@@ -209,18 +188,6 @@ private void addSpanToSpanList(String fieldName, int start, int end, String key,
         spanList.add(span);
     }
 
-    /**
-     * @about Modifies schema, fields and creates a new span tuple
-     */
-    private ITuple getSpanTuple() {
-        IField spanListField = new ListField<Span>(new ArrayList<>(spanList));
-        List<IField> fieldListDuplicate = new ArrayList<>(fields);
-        fieldListDuplicate.add(spanListField);
-
-        IField[] fieldsDuplicate = fieldListDuplicate.toArray(new IField[fieldListDuplicate.size()]);
-        return new DataTuple(spanSchema, fieldsDuplicate);
-    }
-
     /**
      * @about Closes the operator
      */