Merge pull request #100 from TextDB/team3-regex-trigram

[Issue #30 #99] Team3 regex trigram
Texera · May 17, 2016 · fc894b7 · fc894b7
2 parents d464d88 + 5d957c7
commit fc894b7
Show file tree

Hide file tree

Showing 5 changed files with 526 additions and 0 deletions.
diff --git a/...extdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java b/...extdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java
@@ -0,0 +1,181 @@
+package edu.uci.ics.textdb.dataflow.regexmatch;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.StringJoiner;
+
+import edu.uci.ics.textdb.common.constants.DataConstants;
+
+
+/**
+ * A node of a boolean query tree over n-grams. <br>
+ * Each node carries an operator, a list of n-gram operands (direct leafs)
+ * and a list of sub-queries, all combined with that operator. <br>
+ */
+class GramBooleanQuery {
+	/** Operator attached to a query node. */
+	enum QueryOp {
+		NONE, // doesn't match any string
+		ANY,  // matches any string
+
+		AND,
+		OR
+	}
+	QueryOp operator;
+	List<String> operandList;
+	List<GramBooleanQuery> subQueryList;
+
+	int gramLength;
+
+	/**
+	 * Constructs a GramBooleanQuery with default gram length 3. <br>
+	 * @param operator, the operator of this query node.
+	 */
+	GramBooleanQuery(QueryOp operator) {
+		this(operator, 3);
+	}
+
+	/**
+	 * Constructs a GramBooleanQuery with empty operand and sub-query lists. <br>
+	 * @param operator, the operator of this query node.
+	 * @param gramLength, the length of the n-grams literals are split into.
+	 */
+	GramBooleanQuery(QueryOp operator, int gramLength) {
+		this.operator = operator;
+		operandList = new ArrayList<String>();
+		subQueryList = new ArrayList<GramBooleanQuery>();
+		this.gramLength = gramLength;
+	}
+
+	/**
+	 * This returns a GramBooleanQuery's hash code. <br>
+	 * It won't traverse the whole tree, instead, 
+	 * it only calculates the hashcode of direct leafs. <br>
+	 * Every distinct operand is folded in exactly once, because 
+	 * {@link #equals(Object)} compares operands as sets: two queries whose 
+	 * operand lists differ only in which operand is duplicated are equal 
+	 * and therefore must produce the same hash code. <br>
+	 */
+	@Override
+	public int hashCode() {
+		int hashCode = operator.toString().hashCode();
+		// XOR over the de-duplicated operands; XOR is order-independent,
+		// so the iteration order of the set doesn't matter.
+		for (String s : new HashSet<String>(operandList)) {
+			hashCode = hashCode ^ s.hashCode();
+		}
+		return hashCode;
+	}
+
+	/**
+	 * This overrides "equals" function. Whenever a GramBooleanQuery 
+	 * object is compared to another object, this function will be called. <br>
+	 * It recursively traverses the query tree and compares 
+	 * the set of sub-queries (order doesn't matter). <br>
+	 * It internally uses a HashSet to compare sub-queries. <br>
+	 * The gram length is not part of the comparison. <br>
+	 */
+	@Override
+	public boolean equals(Object compareTo) {
+		if (! (compareTo instanceof GramBooleanQuery)) {
+			return false;
+		}
+
+		GramBooleanQuery query = (GramBooleanQuery) compareTo;
+		if (this.operator != query.operator
+			|| this.operandList.size() != query.operandList.size()
+			|| this.subQueryList.size() != query.subQueryList.size()) {
+			return false;
+		}
+
+		Set<String> operandSet = new HashSet<String>(this.operandList);
+		if (!operandSet.equals(new HashSet<String>(query.operandList))) {
+			return false;
+		}
+
+		Set<GramBooleanQuery> subQuerySet = new HashSet<GramBooleanQuery>(this.subQueryList);
+		if (!subQuerySet.equals(new HashSet<GramBooleanQuery>(query.subQueryList))) {
+			return false;
+		}
+
+		return true;
+	}
+
+	/**
+	 * This method takes a list of strings and adds them to the query tree. <br>
+	 * For example, if the list is {abcd, wxyz}, then: <br>
+	 * trigrams({abcd, wxyz}) = trigrams(abcd) OR trigrams(wxyz) <br>
+	 * OR operator is assumed for a list of strings. <br>
+	 * @param list, a list of strings to be added into query.
+	 */
+	void add(List<String> list) {
+		addOrNode(list);
+	}
+
+	/**
+	 * Adds one OR sub-query holding an AND sub-query per literal. <br>
+	 * The new node inherits this node's gram length, so the literals 
+	 * are split with the same n as the rest of the tree. <br>
+	 * @param literalList, the literals to be OR-ed together.
+	 */
+	private void addOrNode(List<String> literalList) {
+		GramBooleanQuery query = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR, this.gramLength);
+		for (String literal : literalList) {
+			query.addAndNode(literal);
+		}
+		this.subQueryList.add(query);
+	}
+
+	/**
+	 * This method takes a single string and adds it to the query tree. <br>
+	 * The string is converted to multiple n-grams with an AND operator. <br>
+	 * For example: if the string is abcd, then: <br>
+	 * trigrams(abcd) = abc AND bcd <br>
+	 * AND operator is assumed for a single string. <br>
+	 * The new node inherits this node's gram length. <br>
+	 * @param literal, the string to be converted to n-grams.
+	 */
+	private void addAndNode(String literal) {
+		GramBooleanQuery query = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND, this.gramLength);
+		query.operandList.addAll(literalToNGram(literal));
+		this.subQueryList.add(query);
+	}
+
+	/**
+	 * This function builds a list of N-Grams that a given literal contains. <br>
+	 * If the length of the literal is smaller than N, it returns an empty list. <br>
+	 * For example, for literal "textdb", its tri-gram list should be ["tex", "ext", "xtd", "tdb"]
+	 * @param literal, the string to be split.
+	 * @return the list of n-grams of length {@code gramLength}, in order of occurrence.
+	 */
+	private List<String> literalToNGram(String literal) {
+		List<String> nGrams = new ArrayList<>();
+		if (literal.length() >= gramLength) {
+			for (int i = 0; i <= literal.length()-gramLength; ++i) {
+				nGrams.add(literal.substring(i, i+gramLength));
+			}
+		}
+		return nGrams;
+	}
+
+	/**
+	 * @return boolean expression, same as {@link #getLuceneQueryString()}
+	 */
+	@Override
+	public String toString() {
+		return this.getLuceneQueryString();
+	}
+
+	/**
+	 * This function recursively connects 
+	 *   operand in {@code operandList} and subqueries in {@code subQueryList} 
+	 *   with {@code operator}. <br>
+	 * It generates a string representing the query that can be directly parsed by Lucene. <br>
+	 * An ANY node yields {@code DataConstants.SCAN_QUERY}, a NONE node yields 
+	 * an empty string, and sub-queries that yield an empty string are skipped. <br>
+	 * Operands are joined as they are, without any escaping. <br>
+	 * @return boolean expression
+	 */
+	String getLuceneQueryString() {
+		if (operator == QueryOp.ANY) {
+			return DataConstants.SCAN_QUERY;
+		} else if (operator == QueryOp.NONE) {
+			return "";
+		} else {
+			StringJoiner joiner =  new StringJoiner(
+					(operator == QueryOp.AND) ? " AND " : " OR ");
+			for (String operand : operandList) {
+				joiner.add(operand);
+			}
+			for (GramBooleanQuery subQuery : subQueryList) {
+				String subQueryStr = subQuery.getLuceneQueryString();
+				if (! subQueryStr.equals("")) {
+					joiner.add(subQueryStr);
+				}
+			}
+
+			// Nothing was joined: return an empty string rather than "()".
+			if (joiner.length() == 0) {
+				return "";
+			} else {
+				return "("+joiner.toString()+")";
+			}
+		}
+	}
+
+
+}
diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexInfo.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexInfo.java
@@ -0,0 +1,82 @@
+package edu.uci.ics.textdb.dataflow.regexmatch;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * RegexInfo for translating regex to an n-gram boolean query. <br>
+ * see <a href='https://swtch.com/~rsc/regexp/regexp4.html'>https://swtch.com/~rsc/regexp/regexp4.html</a> for details. <br>
+ * 
+ * @author Zuozhi Wang
+ * @author Shuying Lai
+ */
+class RegexInfo {
+	boolean emptyable;
+	List<String> exact;
+	List<String> prefix;
+	List<String> suffix;
+	GramBooleanQuery match;
+
+	/**
+	 * Creates a RegexInfo in its default state:
+	 * emptyable is false,
+	 * exact, prefix and suffix are empty lists,
+	 * match is a query with the ANY operator.
+	 */
+	RegexInfo() {
+		this.emptyable = false;
+		this.exact = new ArrayList<>();
+		this.prefix = new ArrayList<>();
+		this.suffix = new ArrayList<>();
+		this.match = new GramBooleanQuery(GramBooleanQuery.QueryOp.ANY);
+	}
+
+	/**
+	 * This function shouldn't be called unless something goes wrong.
+	 * It is used to handle error cases.
+	 * @return RegexInfo describing a regex that matches NO string
+	 */
+	static RegexInfo matchNone() {
+		RegexInfo noneInfo = new RegexInfo();
+		// Swap the default ANY query for a fresh NONE query.
+		noneInfo.match = new GramBooleanQuery(GramBooleanQuery.QueryOp.NONE);
+		return noneInfo;
+	}
+
+	/**
+	 * The match query keeps the ANY operator set by the constructor.
+	 * @return RegexInfo describing a regex that matches ANY string
+	 */
+	static RegexInfo matchAny() {
+		RegexInfo anyInfo = new RegexInfo();
+		anyInfo.prefix.add("");
+		anyInfo.suffix.add("");
+		anyInfo.emptyable = true;
+		return anyInfo;
+	}
+
+	/**
+	 * The match query keeps the ANY operator set by the constructor.
+	 * @return RegexInfo describing a regex that matches an EMPTY string
+	 */
+	static RegexInfo emptyString() {
+		RegexInfo emptyInfo = new RegexInfo();
+		emptyInfo.exact.add("");
+		emptyInfo.emptyable = true;
+		return emptyInfo;
+	}
+
+	/** 
+	 * The result is exactly the default state built by the constructor:
+	 * not emptyable, with empty exact, prefix and suffix lists,
+	 * because we don't know the exact character.
+	 * NOTE(review): an earlier comment described these lists as null (unknown),
+	 * but they are empty lists here - confirm which representation is intended.
+	 * @return RegexInfo describing a regex that matches ANY SINGLE character
+	 */
+	static RegexInfo anyChar() {
+		return new RegexInfo();
+	}
+
+}
diff --git a/...flow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslator.java b/...flow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslator.java
@@ -0,0 +1,124 @@
+package edu.uci.ics.textdb.dataflow.regexmatch;
+
+import com.google.re2j.PublicParser;
+import com.google.re2j.PublicRE2;
+import com.google.re2j.PublicRegexp;
+import com.google.re2j.PublicSimplify;
+
+/**
+ * This class translates a regex to a boolean query of n-grams,
+ * according to the <a href='https://swtch.com/~rsc/regexp/regexp4.html'>algorithm</a> 
+ * described in Russ Cox's article. <br>
+ * 
+ * @author Zuozhi Wang
+ * @author Shuying Lai
+ * 
+ */
+public class RegexToGramQueryTranslator {
+
+	/**
+	 * This method translates a regular expression to 
+	 * a boolean expression of n-grams. <br>
+	 * Then the boolean expression can be queried using 
+	 * an n-gram inverted index to speed up regex matching. <br>
+	 * The regex is parsed and simplified with RE2J first; if RE2J rejects it, 
+	 * the result depends on whether Java's own regex engine accepts it. <br>
+	 * 
+	 * @param regex, the regex string to be translated.
+	 * @return GramBooleanQuery, a boolean query of n-grams.
+	 */
+	public static GramBooleanQuery translate(String regex) {
+		try {
+			PublicRegexp parsed = PublicParser.parse(regex, PublicRE2.PERL);
+			PublicRegexp simplified = PublicSimplify.simplify(parsed);
+			return analyze(simplified).match;
+		} catch (com.google.re2j.PatternSyntaxException re2jFailure) {
+			// RE2J parsing failed, fall back to Java Regex
+			return queryForUnparsedRegex(regex);
+		}
+	}
+
+	/**
+	 * Handles a regex that RE2J could not parse. <br>
+	 * If Java Regex compiles it, the result is matchAny (scan based);
+	 * if Java Regex fails too, the result is matchNone (not a regex). <br>
+	 * 
+	 * @param regex, the regex string rejected by RE2J.
+	 * @return the match query of either matchAny or matchNone.
+	 */
+	private static GramBooleanQuery queryForUnparsedRegex(String regex) {
+		try {
+			java.util.regex.Pattern.compile(regex);
+		} catch (java.util.regex.PatternSyntaxException javaFailure) {
+			return RegexInfo.matchNone().match;
+		}
+		return RegexInfo.matchAny().match;
+	}
+
+
+	/**
+	 * This is the main function of analyzing a regular expression. <br>
+	 * It looks at the operator of the given node of the regex abstract 
+	 * syntax tree generated by RE2J and returns the RegexInfo for it. <br>
+	 * Only PLUS descends into a sub-expression so far; operators that 
+	 * are not handled yet are treated as matching any string. <br>
+	 * 
+	 * @param re, a node of the regex abstract syntax tree.
+	 * @return RegexInfo
+	 */
+	private static RegexInfo analyze(PublicRegexp re) {
+
+		switch (re.getOp()) {
+		// NO_MATCH is a regex that doesn't match anything.
+		// It's used to handle error cases, which shouldn't 
+		// happen unless something goes wrong.
+		case NO_MATCH:
+			return RegexInfo.matchNone();
+
+		// The following cases are treated as 
+		// a regex that matches an empty string.
+		case EMPTY_MATCH:
+		case WORD_BOUNDARY:
+		case NO_WORD_BOUNDARY:
+		case BEGIN_LINE:
+		case END_LINE:
+		case BEGIN_TEXT:
+		case END_TEXT:
+			return RegexInfo.emptyString();
+
+		// A regex that matches any character
+		case ANY_CHAR:
+		case ANY_CHAR_NOT_NL:
+			return RegexInfo.anyChar();
+
+		// A regex that indicates one or more occurrences of an expression.
+		case PLUS: {
+			// The regexInfo of "(expr)+" should be the same as the info of "expr", 
+			// except that "exact" is null, because we don't know the number of repetitions.
+			RegexInfo repeated = analyze(re.getSubs()[0]);
+			repeated.exact = null;
+			return repeated;
+		}
+
+		// TODO finish for every case below; until then each of them
+		// is answered with matchAny.
+		case ALTERNATE:
+		case CAPTURE:
+		case CHAR_CLASS:
+		case CONCAT:
+		case LEFT_PAREN:
+		case LITERAL:
+		case QUEST:
+		// REPEAT: an expression is matched at least min times, at most max times.
+		case REPEAT:
+		case VERTICAL_BAR:
+		// STAR: zero or more occurrences of an expression.
+		case STAR:
+		default:
+			return RegexInfo.matchAny();
+		}
+	}
+
+}