-
Notifications
You must be signed in to change notification settings - Fork 76
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #100 from TextDB/team3-regex-trigram
- Loading branch information
Showing
5 changed files
with
526 additions
and
0 deletions.
There are no files selected for viewing
181 changes: 181 additions & 0 deletions
181
...extdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
package edu.uci.ics.textdb.dataflow.regexmatch; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
import java.util.StringJoiner; | ||
|
||
import edu.uci.ics.textdb.common.constants.DataConstants; | ||
|
||
|
||
class GramBooleanQuery { | ||
enum QueryOp { | ||
NONE, // doesn't match any string | ||
ANY, // matches any string | ||
|
||
AND, | ||
OR | ||
} | ||
QueryOp operator; | ||
List<String> operandList; | ||
List<GramBooleanQuery> subQueryList; | ||
|
||
int gramLength; | ||
|
||
/** | ||
* Constructs a GramBooleanQuery with default gram length 3. <br> | ||
* @param operator | ||
*/ | ||
GramBooleanQuery(QueryOp operator) { | ||
this(operator, 3); | ||
} | ||
|
||
GramBooleanQuery(QueryOp operator, int gramLength) { | ||
this.operator = operator; | ||
operandList = new ArrayList<String>(); | ||
subQueryList = new ArrayList<GramBooleanQuery>(); | ||
this.gramLength = gramLength; | ||
} | ||
|
||
/** | ||
* This returns a GramBooleanQuery's hash code. <br> | ||
* It won't traverse the whole tree, instead, | ||
* it only calculates the hashcode of direct leafs. <br> | ||
* | ||
*/ | ||
@Override | ||
public int hashCode() { | ||
int hashCode = operator.toString().hashCode(); | ||
for (String s : operandList) { | ||
hashCode = hashCode ^ s.hashCode(); | ||
} | ||
return hashCode; | ||
} | ||
|
||
/** | ||
* This overrides "equals" function. Whenever a GramBooleanQUery | ||
* object is compared to another object, this function will be called. <br> | ||
* It recursively traverses the query tree and compares | ||
* the set of sub-queries (order doesn't matter). <br> | ||
* It internally uses a HashSet to compare sub-queries. <br> | ||
*/ | ||
@Override | ||
public boolean equals(Object compareTo) { | ||
if (! (compareTo instanceof GramBooleanQuery)) { | ||
return false; | ||
} | ||
|
||
GramBooleanQuery query = (GramBooleanQuery) compareTo; | ||
if (this.operator != query.operator | ||
|| this.operandList.size() != query.operandList.size() | ||
|| this.subQueryList.size() != query.subQueryList.size()) { | ||
return false; | ||
} | ||
|
||
Set<String> operandSet = new HashSet<String>(this.operandList); | ||
if (!operandSet.equals(new HashSet<String>(query.operandList))) { | ||
return false; | ||
} | ||
|
||
Set<GramBooleanQuery> subQuerySet = new HashSet<GramBooleanQuery>(this.subQueryList); | ||
if (!subQuerySet.equals(new HashSet<GramBooleanQuery>(query.subQueryList))) { | ||
return false; | ||
} | ||
|
||
return true; | ||
} | ||
|
||
/** | ||
* This method takes a list of strings and adds them to the query tree. <br> | ||
* For example, if the list is {abcd, wxyz}, then: <br> | ||
* trigrams({abcd, wxyz}) = trigrams(abcd) OR trigrams(wxyz) <br> | ||
* OR operator is assumed for a list of strings. <br> | ||
* @param list, a list of strings to be added into query. | ||
*/ | ||
void add(List<String> list) { | ||
addOrNode(list); | ||
} | ||
|
||
private void addOrNode(List<String> literalList) { | ||
GramBooleanQuery query = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); | ||
for (String literal : literalList) { | ||
query.addAndNode(literal); | ||
} | ||
this.subQueryList.add(query); | ||
} | ||
|
||
/** | ||
* This method takes a single string and adds it to the query tree. <br> | ||
* The string is converted to multiple n-grams with an AND operator. <br> | ||
* For example: if the string is abcd, then: <br> | ||
* trigrams(abcd) = abc AND bcd <br> | ||
* AND operator is assumed for a single string. <br> | ||
* @param literal | ||
*/ | ||
private void addAndNode(String literal) { | ||
GramBooleanQuery query = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); | ||
for (String nGram: literalToNGram(literal)) { | ||
query.operandList.add(nGram); | ||
} | ||
this.subQueryList.add(query); | ||
} | ||
|
||
/** | ||
* This function builds a list of N-Grams that a given literal contains. <br> | ||
* If the length of the literal is smaller than N, it returns an empty list. <br> | ||
* For example, for literal "textdb", its tri-gram list should be ["tex", "ext", "xtd", "tdb"] | ||
* @param literal | ||
* @return | ||
*/ | ||
private List<String> literalToNGram(String literal) { | ||
ArrayList<String> nGrams = new ArrayList<>(); | ||
if (literal.length() >= gramLength) { | ||
for (int i = 0; i <= literal.length()-gramLength; ++i) { | ||
nGrams.add(literal.substring(i, i+gramLength)); | ||
} | ||
} | ||
return nGrams; | ||
} | ||
|
||
/** | ||
* @return boolean expression | ||
*/ | ||
public String toString() { | ||
return this.getLuceneQueryString(); | ||
} | ||
|
||
/** | ||
* This function recursively connects | ||
* operand in {@code operandList} and subqueries in {@code subqueryList} | ||
* with {@code operator}. <br> | ||
* It generates a string representing the query that can be directly parsed by Lucene. | ||
* @return boolean expression | ||
*/ | ||
String getLuceneQueryString() { | ||
if (operator == QueryOp.ANY) { | ||
return DataConstants.SCAN_QUERY; | ||
} else if (operator == QueryOp.NONE) { | ||
return ""; | ||
} else { | ||
StringJoiner joiner = new StringJoiner( | ||
(operator == QueryOp.AND) ? " AND " : " OR "); | ||
for (String operand : operandList) { | ||
joiner.add(operand); | ||
} | ||
for (GramBooleanQuery subQuery : subQueryList) { | ||
String subQueryStr = subQuery.getLuceneQueryString(); | ||
if (! subQueryStr.equals("")) | ||
joiner.add(subQueryStr); | ||
} | ||
|
||
if (joiner.length() == 0) { | ||
return ""; | ||
} else { | ||
return "("+joiner.toString()+")"; | ||
} | ||
} | ||
} | ||
|
||
|
||
} |
82 changes: 82 additions & 0 deletions
82
textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexInfo.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
package edu.uci.ics.textdb.dataflow.regexmatch; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* @Author Zuozhi Wang | ||
* @Author Shuying Lai | ||
* | ||
* RegexInfo for translating regex to an n-gram boolean query. <br> | ||
* see <a href='https://swtch.com/~rsc/regexp/regexp4.html'>https://swtch.com/~rsc/regexp/regexp4.html</a> for details. <br> | ||
*/ | ||
class RegexInfo { | ||
boolean emptyable; | ||
List<String> exact = null; | ||
List<String> prefix = null; | ||
List<String> suffix = null; | ||
GramBooleanQuery match = null; | ||
|
||
/** | ||
* This initializes RegexInfo: | ||
* emptyable to false | ||
* exact, prefix, suffix to empty arraylist | ||
* match to match ALL | ||
*/ | ||
RegexInfo() { | ||
emptyable = false; | ||
exact = new ArrayList<String>(); | ||
prefix = new ArrayList<String>(); | ||
suffix = new ArrayList<String>(); | ||
match = new GramBooleanQuery(GramBooleanQuery.QueryOp.ANY); | ||
} | ||
|
||
/** | ||
* @return RegexInfo describing a regex that matches NO string | ||
* This function shouldn't be called unless something goes wrong. | ||
* It is used to handle error cases. | ||
*/ | ||
static RegexInfo matchNone() { | ||
RegexInfo regexInfo = new RegexInfo(); | ||
regexInfo.match.operator = GramBooleanQuery.QueryOp.NONE; | ||
return regexInfo; | ||
} | ||
|
||
/** | ||
* | ||
* @return RegexInfo describing a regex that matches ANY string | ||
*/ | ||
static RegexInfo matchAny() { | ||
RegexInfo regexInfo = new RegexInfo(); | ||
regexInfo.emptyable = true; | ||
regexInfo.prefix.add(""); | ||
regexInfo.suffix.add(""); | ||
regexInfo.match.operator = GramBooleanQuery.QueryOp.ANY; | ||
return regexInfo; | ||
} | ||
|
||
/** | ||
* | ||
* @return RegexInfo describing a regex that matches an EMPTY string | ||
*/ | ||
static RegexInfo emptyString() { | ||
|
||
RegexInfo regexInfo = new RegexInfo(); | ||
regexInfo.emptyable = true; | ||
regexInfo.match.operator = GramBooleanQuery.QueryOp.ANY; | ||
regexInfo.exact.add(""); | ||
return regexInfo; | ||
} | ||
|
||
/** | ||
* @return RegexInfo describing a regex that matches ANY SINGLE character | ||
* For anyChar, prefix, suffix, and exact are null (unknown), | ||
* because we don't know the exact character. | ||
*/ | ||
static RegexInfo anyChar() { | ||
RegexInfo regexInfo = new RegexInfo(); | ||
regexInfo.emptyable = false; | ||
return regexInfo; | ||
} | ||
|
||
} |
124 changes: 124 additions & 0 deletions
124
...flow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslator.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
package edu.uci.ics.textdb.dataflow.regexmatch; | ||
|
||
import com.google.re2j.PublicParser; | ||
import com.google.re2j.PublicRE2; | ||
import com.google.re2j.PublicRegexp; | ||
import com.google.re2j.PublicSimplify; | ||
|
||
/** | ||
* This class translates a regex to a boolean query of n-grams, | ||
* according to the <a href='https://swtch.com/~rsc/regexp/regexp4.html'>algorithm</a> | ||
* described in Russ Cox's article. <br> | ||
* | ||
* @Author Zuozhi Wang | ||
* @Author Shuying Lai | ||
* | ||
*/ | ||
public class RegexToGramQueryTranslator { | ||
|
||
/** | ||
* This method translates a regular expression to | ||
* a boolean expression of n-grams. <br> | ||
* Then the boolean expression can be queried using | ||
* an n-gram inverted index to speed up regex matching. <br> | ||
* | ||
* @param regex, the regex string to be translated. | ||
* @return GamBooleanQeruy, a boolean query of n-grams. | ||
*/ | ||
public static GramBooleanQuery translate(String regex) { | ||
// try to parse using RE2J | ||
try { | ||
PublicRegexp re = PublicParser.parse(regex, PublicRE2.PERL); | ||
re = PublicSimplify.simplify(re); | ||
RegexInfo regexInfo = analyze(re); | ||
return regexInfo.match; | ||
// if RE2J parsing fails | ||
} catch (com.google.re2j.PatternSyntaxException re2j_e) { | ||
// try to parse using Java Regex | ||
// if succeeds, return matchAll (scan based) | ||
try { | ||
java.util.regex.Pattern.compile(regex); | ||
return RegexInfo.matchAny().match; | ||
// if Java Regex fails too, return matchNone (not a regex) | ||
} catch (java.util.regex.PatternSyntaxException java_e) { | ||
return RegexInfo.matchNone().match; | ||
} | ||
} | ||
} | ||
|
||
|
||
/** | ||
* This is the main function of analyzing a regular expression. <br> | ||
* This methods walks through the regex abstract syntax tree generated by RE2J, | ||
* and | ||
* | ||
* @param PublicRegexp | ||
* @return RegexInfo | ||
*/ | ||
private static RegexInfo analyze(PublicRegexp re) { | ||
|
||
switch (re.getOp()) { | ||
// NO_MATCH is a regex that doesn't match anything. | ||
// It's used to handle error cases, which shouldn't | ||
// happen unless something goes wrong. | ||
case NO_MATCH: { | ||
return RegexInfo.matchNone(); | ||
} | ||
// The following cases are treated as | ||
// a regex that matches an empty string. | ||
case EMPTY_MATCH: | ||
case WORD_BOUNDARY: case NO_WORD_BOUNDARY: | ||
case BEGIN_LINE: case END_LINE: | ||
case BEGIN_TEXT: case END_TEXT: { | ||
return RegexInfo.emptyString(); | ||
} | ||
// A regex that matches any character | ||
case ANY_CHAR: case ANY_CHAR_NOT_NL: { | ||
return RegexInfo.anyChar(); | ||
} | ||
// TODO finish for every case | ||
case ALTERNATE: | ||
//TODO | ||
return RegexInfo.matchAny(); | ||
case CAPTURE: | ||
//TODO | ||
return RegexInfo.matchAny(); | ||
case CHAR_CLASS: | ||
//TODO | ||
return RegexInfo.matchAny(); | ||
case CONCAT: | ||
//TODO | ||
return RegexInfo.matchAny(); | ||
case LEFT_PAREN: | ||
//TODO | ||
return RegexInfo.matchAny(); | ||
case LITERAL: | ||
//TODO | ||
return RegexInfo.matchAny(); | ||
// A regex that indicates one or more occurrences of an expression. | ||
case PLUS: | ||
// The regexInfo of "(expr)+" should be the same as the info of "expr", | ||
// except that "exact" is null, because we don't know the number of repetitions. | ||
RegexInfo info = analyze(re.getSubs()[0]); | ||
info.exact = null; | ||
return info; | ||
case QUEST: | ||
//TODO | ||
return RegexInfo.matchAny(); | ||
// A regex that indicates an expression is matched | ||
// at least min times, at most max times. | ||
case REPEAT: | ||
//TODO | ||
return RegexInfo.matchAny(); | ||
// A regex that indicates zero or more occurrences of an expression. | ||
case STAR: | ||
return RegexInfo.matchAny(); | ||
case VERTICAL_BAR: | ||
//TODO | ||
return RegexInfo.matchAny(); | ||
default: | ||
return RegexInfo.matchAny(); | ||
} | ||
} | ||
|
||
} |
Oops, something went wrong.