Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Issue #31] (Team 4) Enabling positional indexing in Lucene for TEXT type #103

Merged
merged 11 commits into from
May 18, 2016
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public boolean equals(Object obj) {
if (list == null) {
if (other.list != null)
return false;
// NOTE(review): '&' is the non-short-circuiting boolean operator — use '&&' so the
// second containsAll() is skipped when the first is already false.
// Also note: mutual containsAll() compares the lists as SETS — it ignores element
// cardinality (duplicates) and order, unlike List.equals — confirm that is intended.
} else if ( !(list.containsAll(other.list) & other.list.containsAll(list)))
return false;
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,41 @@
public class Span {
//The name of the field (in the tuple) where this span is present
private String fieldName;
//The start of the span
//The start position of the span, which is the offset of the gap before the first character of the span.
private int start;
//The end of the span
//The end position of the span, which is the offset of the gap after the last character of the span.
private int end;
//The key we are searching for eg: regex
private String key;
//The value matching the key
private String value;


public Span(String fieldName, int start, int end, String key, String value) {
// The token position of the span, starting from 0.
private int tokenOffset;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we added one more offset, it would be good to add comments explaining that "start" and "end" are character offsets. Also add an example to illustrate their meaning, and explain that character offsets refer to "gaps," and that "tokenOffset" starts from 0 (?).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


/*
Example:
Value = "The quick brown fox jumps over the lazy dog"
The Span for "brown" would be:
start = 10 : index of the character 'b'
end = 15 : index of the character 'n' + 1, equivalently start + length
Both expressions yield the same value.
tokenOffset = 2 : position of the word 'brown'
*/

public static int INVALID_TOKEN_OFFSET = -1;

/**
 * Constructs a Span over a character range of the given field, without token information.
 *
 * @param fieldName name of the field (in the tuple) where this span is present
 * @param start     character offset of the gap before the first character of the span
 * @param end       character offset of the gap after the last character (start + length)
 * @param key       the key being searched for, e.g. a regex
 * @param value     the matched text for the key
 */
public Span(String fieldName, int start, int end, String key, String value){
this.fieldName = fieldName;
this.start = start;
this.end = end;
this.key = key;
this.value = value;
// No token position is known for this constructor; mark it invalid.
this.tokenOffset = INVALID_TOKEN_OFFSET;
}

/**
 * Constructs a Span that additionally records the token (word) position.
 *
 * @param tokenOffset 0-based position of the matched token within the field's token
 *                    stream; pass INVALID_TOKEN_OFFSET when unknown
 */
public Span(String fieldName, int start, int end, String key, String value, int tokenOffset) {
this(fieldName, start, end, key, value);
this.tokenOffset = tokenOffset;
}

public String getFieldName() {
Expand All @@ -41,6 +60,8 @@ public int getEnd() {
return end;
}

public int getTokenOffset(){return tokenOffset;}

@Override
public int hashCode() {
final int prime = 31;
Expand All @@ -51,6 +72,7 @@ public int hashCode() {
result = prime * result + ((key == null) ? 0 : key.hashCode());
result = prime * result + start;
result = prime * result + ((value == null) ? 0 : value.hashCode());
result = prime * result + tokenOffset;
return result;
}

Expand Down Expand Up @@ -87,7 +109,10 @@ public boolean equals(Object obj) {
return false;
} else if (!value.equals(other.value))
return false;


if(tokenOffset!= other.tokenOffset)
return false;

return true;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.apache.lucene.document.DateTools.Resolution;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexOptions;

import edu.uci.ics.textdb.api.common.Attribute;
import edu.uci.ics.textdb.api.common.FieldType;
Expand Down Expand Up @@ -49,18 +50,18 @@ public static IField getField(FieldType fieldType, String fieldValue) throws Par
case TEXT:
field = new TextField(fieldValue);
break;

default:
break;
}
return field;
}

public static IndexableField getLuceneField(FieldType fieldType,
String fieldName, Object fieldValue) {
String fieldName, Object fieldValue) {
IndexableField luceneField = null;
switch(fieldType){
case STRING:
case STRING:
luceneField = new org.apache.lucene.document.StringField(
fieldName, (String) fieldValue, Store.YES);
break;
Expand All @@ -78,10 +79,22 @@ public static IndexableField getLuceneField(FieldType fieldType,
luceneField = new org.apache.lucene.document.StringField(fieldName, dateString, Store.YES);
break;
case TEXT:
luceneField = new org.apache.lucene.document.TextField(
fieldName, (String) fieldValue, Store.YES);
break;

//By default we enable positional indexing in Lucene so that we can return
// information about character offsets and token offsets
org.apache.lucene.document.FieldType luceneFieldType = new org.apache.lucene.document.FieldType();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add comments to the codebase: "By default we enable positional indexing in Lucene so that we can return information about character offsets and token offsets."

luceneFieldType.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS );
luceneFieldType.setStored(true);
luceneFieldType.setStoreTermVectors( true );
luceneFieldType.setStoreTermVectorOffsets( true );
luceneFieldType.setStoreTermVectorPayloads( true );
luceneFieldType.setStoreTermVectorPositions( true );
luceneFieldType.setTokenized( true );

luceneField = new org.apache.lucene.document.Field(
fieldName,(String) fieldValue,luceneFieldType);

break;

}
return luceneField;
}
Expand All @@ -96,10 +109,10 @@ public static ITuple getSpanTuple( List<IField> fieldList, List<Span> spanList,
IField[] fieldsDuplicate = fieldListDuplicate.toArray(new IField[fieldListDuplicate.size()]);
return new DataTuple(spanSchema, fieldsDuplicate);
}

/**
*
* @param schema
*
* @param schema
* @about Creating a new schema object, and adding SPAN_LIST_ATTRIBUTE to
* the schema. SPAN_LIST_ATTRIBUTE is of type List
*/
Expand All @@ -117,21 +130,25 @@ public static Schema createSpanSchema(Schema schema) {

/**
* Tokenizes the query string using the given analyser
* @param analyzer
* @param luceneAnalyzer
* @param query
* @return ArrayList<String> list of results
*/
public static ArrayList<String> tokenizeQuery(Analyzer analyzer, String query) {
public static ArrayList<String> tokenizeQuery(Analyzer luceneAnalyzer, String query) {
HashSet<String> resultSet = new HashSet<>();
ArrayList<String> result = new ArrayList<String>();
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(query));
TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

try{
tokenStream.reset();
while (tokenStream.incrementToken()) {
String term = charTermAttribute.toString();
resultSet.add(term);
String token = charTermAttribute.toString();
int tokenIndex = query.toLowerCase().indexOf(token);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we do this extra substring() step? Add comments here and to the codebase.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since tokens are converted to lower case, get the exact token from the query string.

// Since tokens are converted to lower case,
// get the exact token from the query string.
String actualQueryToken = query.substring(tokenIndex, tokenIndex+token.length());
resultSet.add(actualQueryToken);
}
tokenStream.close();
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ public class DictionaryPredicate implements IPredicate {
New and York; if searched in String field we search for Exact string.
*/

public DictionaryPredicate(IDictionary dictionary, Analyzer analyzer, List<Attribute> attributeList,
public DictionaryPredicate(IDictionary dictionary, Analyzer luceneAnalyzer, List<Attribute> attributeList,
SourceOperatorType srcOpType, IDataStore dataStore) {

this.dictionary = dictionary;
this.luceneAnalyzer = analyzer;
this.luceneAnalyzer = luceneAnalyzer;
this.attributeList = attributeList;
this.srcOpType = srcOpType;
this.dataStore = dataStore;
Expand Down Expand Up @@ -72,7 +72,7 @@ public Analyzer getAnalyzer() {
public IOperator getScanSourceOperator() throws ParseException, DataFlowException {
QueryParser luceneQueryParser = new QueryParser(attributeList.get(0).getFieldName(), luceneAnalyzer);
Query luceneQuery = luceneQueryParser.parse(DataConstants.SCAN_QUERY);
IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery);
IPredicate dataReaderPredicate = new DataReaderPredicate(dataStore, luceneQuery,DataConstants.SCAN_QUERY,luceneAnalyzer,attributeList);
IDataReader dataReader = new DataReader(dataReaderPredicate);

IOperator operator = new ScanBasedSourceOperator(dataReader);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,18 @@ public class KeywordPredicate implements IPredicate{
private final String query;
private final Query luceneQuery;
private ArrayList<String> tokens;
private Analyzer analyzer;
private Analyzer luceneAnalyzer;
private IDataStore dataStore;

/*
query refers to string of keywords to search for.
For Ex. New york if searched in TextField, we would consider both tokens
New and York; if searched in String field we search for Exact string.
*/
public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer analyzer,IDataStore dataStore ) throws DataFlowException{
public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer luceneAnalyzer, IDataStore dataStore ) throws DataFlowException{
try {
this.query = query;
this.tokens = Utils.tokenizeQuery(analyzer, query);
this.tokens = Utils.tokenizeQuery(luceneAnalyzer, query);
this.attributeList = attributeList;
this.dataStore = dataStore;
String[] temp = new String[attributeList.size()];
Expand All @@ -54,7 +54,7 @@ public KeywordPredicate(String query, List<Attribute> attributeList, Analyzer an
temp[i] = attributeList.get(i).getFieldName();
}
this.fields = temp;
this.analyzer = analyzer;
this.luceneAnalyzer = luceneAnalyzer;
this.luceneQuery = createLuceneQueryObject();
} catch (Exception e) {
e.printStackTrace();
Expand Down Expand Up @@ -105,7 +105,7 @@ and generate boolean query (Textfield is Case Insensitive)
*/
String[] remainingTextFields = (String[]) textFieldList.toArray(new String[0]);
BooleanQuery queryOnTextFields = new BooleanQuery();
MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, analyzer);
MultiFieldQueryParser parser = new MultiFieldQueryParser(remainingTextFields, luceneAnalyzer);

for(String searchToken : this.tokens){
Query termQuery = parser.parse(searchToken);
Expand All @@ -126,16 +126,18 @@ public String getQuery(){
public List<Attribute> getAttributeList() {
return attributeList;
}

public Query getQueryObject(){return this.luceneQuery;}

public ArrayList<String> getTokens(){return this.tokens;}

public Analyzer getAnalyzer(){
return analyzer;
/**
 * Returns the Lucene analyzer used to tokenize the keyword query.
 */
public Analyzer getLuceneAnalyzer() {
    return this.luceneAnalyzer;
}

public DataReaderPredicate getDataReaderPredicate() {
DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery);
DataReaderPredicate dataReaderPredicate = new DataReaderPredicate(this.dataStore, this.luceneQuery,
this.query, this.luceneAnalyzer, this.attributeList);
return dataReaderPredicate;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ public class RegexPredicate implements IPredicate {

private String regex;
private List<String> fieldNameList;

private List<Attribute> attributeList;
private Analyzer luceneAnalyzer;
private IDataStore dataStore;

public RegexPredicate(String regex, List<Attribute> attributeList, Analyzer analyzer, IDataStore dataStore) {
this.regex = regex;
this.luceneAnalyzer = analyzer;
this.dataStore = dataStore;
this.attributeList = attributeList;
this.fieldNameList = attributeList.stream()
.filter(attr -> (attr.getFieldType() == FieldType.TEXT || attr.getFieldType() == FieldType.STRING))
.map(attr -> attr.getFieldName()).collect(Collectors.toList());
Expand All @@ -52,4 +53,8 @@ public List<String> getFieldNameList() {
return this.fieldNameList;
}

/**
 * Returns the attributes this regex predicate is applied to.
 */
public List<Attribute> getAttributeList() {
    return this.attributeList;
}

}
Loading