Texera · prakul · May 6, 2016 · Apr 24, 2016 · Apr 24, 2016 · Apr 24, 2016
diff --git a/...db/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java b/...db/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/common/KeywordPredicate.java
@@ -0,0 +1,109 @@
+package edu.uci.ics.textdb.dataflow.common;
+
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import edu.uci.ics.textdb.api.common.Attribute;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
+import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Query;
+
+import edu.uci.ics.textdb.api.common.IField;
+import edu.uci.ics.textdb.api.common.IPredicate;
+import edu.uci.ics.textdb.api.common.ITuple;
+import edu.uci.ics.textdb.common.exception.DataFlowException;
+import edu.uci.ics.textdb.common.field.StringField;
+
+/**
+ * Predicate describing a keyword search over a list of attributes.
+ *
+ * The raw query string is tokenized with the supplied analyzer, and a Lucene
+ * {@link BooleanQuery} is built in which every token is a required
+ * ({@link BooleanClause.Occur#MUST}) clause parsed against all attribute fields.
+ *
+ *  @author prakul
+ *  @author akshay
+ */
+public class KeywordPredicate implements IPredicate{
+
+    private final List<Attribute> attributeList;
+    private final String[] fields;
+    private final String query;
+    private final Query queryObject;
+    private final ArrayList<String> tokens;
+    private final Analyzer analyzer;
+
+    /**
+     * Builds the predicate and its Lucene query.
+     *
+     * @param query         raw keyword query string
+     * @param attributeList attributes whose field names are searched
+     * @param analyzer      analyzer used both to tokenize the query and to parse each token
+     * @throws DataFlowException wrapping any failure while tokenizing the query or
+     *                           building the query object (the original exception is the cause)
+     */
+    public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer analyzer ) throws DataFlowException{
+        try {
+            this.query = query;
+            this.attributeList = attributeList;
+
+            String[] fieldNames = new String[attributeList.size()];
+            for (int i = 0; i < fieldNames.length; i++) {
+                fieldNames[i] = attributeList.get(i).getFieldName();
+            }
+            this.fields = fieldNames;
+            this.analyzer = analyzer;
+            // Use the private static helper rather than the public, overridable
+            // queryTokenizer(): a subclass override must not run on a half-built object.
+            this.tokens = tokenize(analyzer, query);
+            // Reads fields, analyzer and tokens, so it has to come last.
+            this.queryObject = createQueryObject();
+        } catch (Exception e) {
+            throw new DataFlowException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Always returns {@code true}; this predicate performs no tuple-level check here.
+     */
+    @Override
+    public boolean satisfy(ITuple tuple) {
+        return true;
+    }
+
+    /**
+     * Creates a boolean query in which every query token must match, each token
+     * being parsed against all configured fields.
+     *
+     * @return the combined query
+     * @throws ParseException if the parser rejects a token
+     */
+    private Query createQueryObject() throws ParseException {
+        BooleanQuery booleanQuery = new BooleanQuery();
+        MultiFieldQueryParser parser = new MultiFieldQueryParser(this.fields, this.analyzer);
+        for (String searchToken : this.tokens) {
+            // Tokens are literal terms, not query syntax: escape them so characters such
+            // as ':' or '*' produced by an analyzer are not interpreted as operators.
+            Query termQuery = parser.parse(QueryParser.escape(searchToken));
+            booleanQuery.add(termQuery, BooleanClause.Occur.MUST);
+        }
+        return booleanQuery;
+    }
+
+    /** @return the raw query string passed to the constructor */
+    public String getQuery(){
+        return query;
+    }
+
+    /** @return the attribute list passed to the constructor (not copied) */
+    public List<Attribute> getAttributeList() {
+        return attributeList;
+    }
+
+    /** @return the Lucene query built from the query tokens */
+    public Query getQueryObject(){
+        return this.queryObject;
+    }
+
+    /** @return the tokens produced by analyzing the query string (not copied) */
+    public ArrayList<String> getTokens(){
+        return this.tokens;
+    }
+
+    /** @return the analyzer passed to the constructor */
+    public Analyzer getAnalyzer(){
+        return analyzer;
+    }
+
+    /**
+     * Splits a query string into terms using the given analyzer.
+     *
+     * @param analyzer analyzer producing the token stream
+     * @param query    text to tokenize
+     * @return the terms in the order the analyzer emitted them
+     * @throws IllegalStateException if the token stream fails with an I/O error
+     */
+    public ArrayList<String> queryTokenizer(Analyzer analyzer,  String query) {
+        return tokenize(analyzer, query);
+    }
+
+    /**
+     * Runs the analyzer over the query and collects every emitted term.
+     * The stream is always closed, and a failure is reported instead of
+     * silently returning a partial token list.
+     */
+    private static ArrayList<String> tokenize(Analyzer analyzer, String query) {
+        ArrayList<String> result = new ArrayList<String>();
+        try (TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query))) {
+            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                result.add(charTermAttribute.toString());
+            }
+            // Signals end-of-stream to the token stream before it is closed.
+            tokenStream.end();
+        } catch (java.io.IOException e) {
+            throw new IllegalStateException("Failed to tokenize query: " + query, e);
+        }
+        return result;
+    }
+}
diff --git a/...extdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java b/...extdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/keywordmatch/KeywordMatcher.java
@@ -0,0 +1,193 @@
+package edu.uci.ics.textdb.dataflow.keywordmatch;
+
+import edu.uci.ics.textdb.api.common.*;
+import edu.uci.ics.textdb.common.constants.SchemaConstants;
+import edu.uci.ics.textdb.api.common.Schema;
+import edu.uci.ics.textdb.common.field.Span;
+import edu.uci.ics.textdb.common.field.StringField;
+import edu.uci.ics.textdb.common.field.TextField;
+import edu.uci.ics.textdb.dataflow.common.KeywordPredicate;
+import org.apache.lucene.search.Query;
+import edu.uci.ics.textdb.common.field.ListField;
+import edu.uci.ics.textdb.common.field.DataTuple;
+
+import edu.uci.ics.textdb.api.dataflow.IOperator;
+import edu.uci.ics.textdb.api.dataflow.ISourceOperator;
+import edu.uci.ics.textdb.common.exception.DataFlowException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Operator that pulls tuples from a source operator and returns only those in which
+ * at least one of the predicate's attributes matches the keyword query.
+ *
+ * Each returned tuple is a copy of the source tuple's fields with one extra
+ * {@link ListField} of {@link Span}s appended, described by a schema that is the
+ * source schema plus {@link SchemaConstants#SPAN_LIST_ATTRIBUTE}.
+ *
+ *  @author prakul
+ *
+ */
+public class KeywordMatcher implements IOperator {
+    private final KeywordPredicate predicate;
+    private ISourceOperator sourceOperator;
+
+    // State initialised in open().
+    private String queryValue;
+    private List<Attribute> attributeList;
+    private List<String> queryValueArray;
+    // patternList.get(i) is the whole-word pattern for queryValueArray.get(i).
+    private List<Pattern> patternList;
+
+    // Output schema; built from the first tuple seen after open().
+    private Schema spanSchema;
+
+    /**
+     * @param predicate      must be a {@link KeywordPredicate}; it is cast unconditionally
+     * @param sourceOperator operator supplying the tuples to be filtered
+     */
+    public KeywordMatcher(IPredicate predicate, ISourceOperator sourceOperator) {
+        this.predicate = (KeywordPredicate)predicate;
+        this.sourceOperator = sourceOperator;
+    }
+
+    /**
+     * Opens the source operator and compiles one whole-word regex per query token.
+     *
+     * @throws DataFlowException wrapping any failure (the original exception is the cause)
+     */
+    @Override
+    public void open() throws DataFlowException {
+        try {
+            sourceOperator.open();
+            queryValue = predicate.getQuery();
+            attributeList = predicate.getAttributeList();
+            queryValueArray = predicate.getTokens();
+
+            patternList = new ArrayList<Pattern>();
+            for (String token : queryValueArray) {
+                // Quote the token so regex metacharacters in it are matched literally.
+                String regex = "\\b" + Pattern.quote(token.toLowerCase()) + "\\b";
+                patternList.add(Pattern.compile(regex));
+            }
+
+            // Force the output schema to be rebuilt from the next tuple.
+            spanSchema = null;
+        } catch (Exception e) {
+            throw new DataFlowException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Returns the next source tuple that contains at least one match, extended with
+     * its span list, or {@code null} once the source operator is exhausted.
+     *
+     * @throws DataFlowException wrapping any failure (the original exception is the cause)
+     */
+    @Override
+    public ITuple getNextTuple() throws DataFlowException {
+        try {
+            ITuple sourceTuple;
+            // Iterate (rather than recurse) over non-matching tuples so a long run of
+            // misses cannot exhaust the call stack.
+            while ((sourceTuple = sourceOperator.getNextTuple()) != null) {
+                if (spanSchema == null) {
+                    spanSchema = createSpanSchema(sourceTuple.getSchema());
+                }
+                // A fresh list per tuple: spans never carry over between results.
+                List<Span> spanList = findSpans(sourceTuple);
+                if (!spanList.isEmpty()) {
+                    return getSpanTuple(sourceTuple, spanList);
+                }
+            }
+            return null;
+        } catch (Exception e) {
+            throw new DataFlowException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Collects the spans for every searched attribute of one tuple.
+     *
+     * A {@link StringField} matches only when its whole value equals the lower-cased
+     * query; a {@link TextField} yields one span per whole-word occurrence of each
+     * query token, found case-insensitively. Other field types produce no spans.
+     */
+    private List<Span> findSpans(ITuple tuple) {
+        List<Span> spans = new ArrayList<>();
+        for (Attribute attribute : attributeList) {
+            String fieldName = attribute.getFieldName();
+            IField field = tuple.getField(fieldName);
+            // The value is cast before the field type is checked, so a field whose
+            // value is not a String fails here with a ClassCastException.
+            String fieldValue = (String) field.getValue();
+
+            if (field instanceof StringField) {
+                // NOTE(review): the field value is compared as-is against the
+                // lower-cased query, so a mixed-case field value never matches — confirm intent.
+                if (fieldValue.equals(queryValue.toLowerCase())) {
+                    spans.add(new Span(fieldName, 0, queryValue.length(), queryValue, fieldValue));
+                }
+            } else if (field instanceof TextField) {
+                String loweredValue = fieldValue.toLowerCase();
+                for (int i = 0; i < patternList.size(); i++) {
+                    String token = queryValueArray.get(i);
+                    // Each token is searched from the start of the field.
+                    Matcher matcher = patternList.get(i).matcher(loweredValue);
+                    while (matcher.find()) {
+                        int start = matcher.start();
+                        int end = matcher.end();
+                        spans.add(new Span(fieldName, start, end, token, fieldValue.substring(start, end)));
+                    }
+                }
+            }
+        }
+        return spans;
+    }
+
+    /**
+     * Builds the result tuple: the given source tuple's own fields followed by a
+     * list field holding a copy of the spans.
+     */
+    private ITuple getSpanTuple(ITuple sourceTuple, List<Span> spanList) {
+        IField spanListField = new ListField<Span>(new ArrayList<>(spanList));
+        List<IField> resultFields = new ArrayList<>(sourceTuple.getFields());
+        resultFields.add(spanListField);
+
+        IField[] resultFieldArray = resultFields.toArray(new IField[resultFields.size()]);
+        return new DataTuple(spanSchema, resultFieldArray);
+    }
+
+    /**
+     * Creates the output schema: all attributes of the source schema followed by
+     * {@link SchemaConstants#SPAN_LIST_ATTRIBUTE}.
+     */
+    private Schema createSpanSchema(Schema sourceSchema) {
+        List<Attribute> sourceAttributes = sourceSchema.getAttributes();
+        Attribute[] spanAttributes = new Attribute[sourceAttributes.size() + 1];
+        for (int count = 0; count < sourceAttributes.size(); count++) {
+            spanAttributes[count] = sourceAttributes.get(count);
+        }
+        spanAttributes[spanAttributes.length - 1] = SchemaConstants.SPAN_LIST_ATTRIBUTE;
+        return new Schema(spanAttributes);
+    }
+
+    /**
+     * Closes the source operator.
+     *
+     * @throws DataFlowException wrapping any failure (the original exception is the cause)
+     */
+    @Override
+    public void close() throws DataFlowException {
+        try {
+            sourceOperator.close();
+        } catch (Exception e) {
+            throw new DataFlowException(e.getMessage(), e);
+        }
+    }
+}
diff --git a/...-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/source/IndexSearchSourceOperator.java b/...-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/source/IndexSearchSourceOperator.java
@@ -45,4 +45,6 @@ public void close() throws DataFlowException {
 			throw new DataFlowException(e.getMessage(), e);
 		}
 	}
+
+
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -45,4 +45,6 @@ public void close() throws DataFlowException { @@
     			throw new DataFlowException(e.getMessage(), e);
     		}
     	}
     }