Skip to content

Commit

Permalink
Merge pull request #100 from TextDB/team3-regex-trigram
Browse files Browse the repository at this point in the history
[Issue #30 #99] Team3 regex trigram
  • Loading branch information
laisycs committed May 17, 2016
2 parents d464d88 + 5d957c7 commit fc894b7
Show file tree
Hide file tree
Showing 5 changed files with 526 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package edu.uci.ics.textdb.dataflow.regexmatch;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringJoiner;

import edu.uci.ics.textdb.common.constants.DataConstants;


/**
 * A boolean query over n-grams, stored as a tree. <br>
 * Every node combines its own operands (n-gram strings, kept in {@code operandList})
 * and its child queries (kept in {@code subQueryList}) with a single {@link QueryOp}. <br>
 * The n-gram length configured on a node is inherited by every child node
 * that the node creates through {@link #add(List)}.
 */
class GramBooleanQuery {
    enum QueryOp {
        NONE, // doesn't match any string
        ANY,  // matches any string

        AND,
        OR
    }

    QueryOp operator;
    List<String> operandList;
    List<GramBooleanQuery> subQueryList;

    int gramLength;

    /**
     * Constructs a GramBooleanQuery with default gram length 3. <br>
     * @param operator the operator connecting operands and sub-queries of this node
     */
    GramBooleanQuery(QueryOp operator) {
        this(operator, 3);
    }

    /**
     * Constructs a GramBooleanQuery with an explicit gram length. <br>
     * @param operator the operator connecting operands and sub-queries of this node
     * @param gramLength the length of the n-grams generated by this node
     */
    GramBooleanQuery(QueryOp operator, int gramLength) {
        this.operator = operator;
        operandList = new ArrayList<String>();
        subQueryList = new ArrayList<GramBooleanQuery>();
        this.gramLength = gramLength;
    }

    /**
     * This returns a GramBooleanQuery's hash code. <br>
     * It won't traverse the whole tree, instead,
     * it only calculates the hashcode of direct leafs. <br>
     * Only the distinct operands contribute, because {@link #equals(Object)}
     * compares operands as a set: two equal queries may hold the same grams
     * with different multiplicities and must still hash to the same value. <br>
     */
    @Override
    public int hashCode() {
        int hashCode = operator.toString().hashCode();
        // De-duplicate first: XOR-ing a repeated gram an even number of times
        // would cancel it out and make the hash depend on multiplicity.
        for (String s : new HashSet<String>(operandList)) {
            hashCode = hashCode ^ s.hashCode();
        }
        return hashCode;
    }

    /**
     * This overrides "equals" function. Whenever a GramBooleanQuery
     * object is compared to another object, this function will be called. <br>
     * It recursively traverses the query tree and compares
     * the set of sub-queries (order doesn't matter). <br>
     * It internally uses a HashSet to compare sub-queries, so it relies on
     * {@link #hashCode()} being consistent with this method. <br>
     * @param compareTo the object to compare with
     * @return true if both queries have the same operator, the same number of
     *         operands and sub-queries, and equal sets of operands and sub-queries
     */
    @Override
    public boolean equals(Object compareTo) {
        if (this == compareTo) {
            return true;
        }
        if (! (compareTo instanceof GramBooleanQuery)) {
            return false;
        }

        GramBooleanQuery query = (GramBooleanQuery) compareTo;
        if (this.operator != query.operator
                || this.operandList.size() != query.operandList.size()
                || this.subQueryList.size() != query.subQueryList.size()) {
            return false;
        }

        Set<String> operandSet = new HashSet<String>(this.operandList);
        if (!operandSet.equals(new HashSet<String>(query.operandList))) {
            return false;
        }

        Set<GramBooleanQuery> subQuerySet = new HashSet<GramBooleanQuery>(this.subQueryList);
        return subQuerySet.equals(new HashSet<GramBooleanQuery>(query.subQueryList));
    }

    /**
     * This method takes a list of strings and adds them to the query tree. <br>
     * For example, if the list is {abcd, wxyz}, then: <br>
     * trigrams({abcd, wxyz}) = trigrams(abcd) OR trigrams(wxyz) <br>
     * OR operator is assumed for a list of strings. <br>
     * @param list, a list of strings to be added into query.
     */
    void add(List<String> list) {
        addOrNode(list);
    }

    /**
     * Appends one OR sub-query to this node; the OR node gets one AND child per literal. <br>
     * The new node inherits this node's gram length.
     * @param literalList the literals to be OR-ed together
     */
    private void addOrNode(List<String> literalList) {
        // Pass gramLength down explicitly; the one-argument constructor would
        // silently reset the children to trigrams.
        GramBooleanQuery query = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR, this.gramLength);
        for (String literal : literalList) {
            query.addAndNode(literal);
        }
        this.subQueryList.add(query);
    }

    /**
     * This method takes a single string and adds it to the query tree. <br>
     * The string is converted to multiple n-grams with an AND operator. <br>
     * For example: if the string is abcd, then: <br>
     * trigrams(abcd) = abc AND bcd <br>
     * AND operator is assumed for a single string. <br>
     * The new node inherits this node's gram length.
     * @param literal the string to be split into n-grams
     */
    private void addAndNode(String literal) {
        GramBooleanQuery query = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND, this.gramLength);
        query.operandList.addAll(literalToNGram(literal));
        this.subQueryList.add(query);
    }

    /**
     * This function builds a list of N-Grams that a given literal contains. <br>
     * If the length of the literal is smaller than N, it returns an empty list. <br>
     * For example, for literal "textdb", its tri-gram list should be ["tex", "ext", "xtd", "tdb"]
     * @param literal the string to be split
     * @return every substring of length {@code gramLength}, in order of appearance
     */
    private List<String> literalToNGram(String literal) {
        List<String> nGrams = new ArrayList<String>();
        // The loop bound is negative for literals shorter than gramLength,
        // so such literals produce no grams.
        for (int i = 0; i <= literal.length() - gramLength; ++i) {
            nGrams.add(literal.substring(i, i + gramLength));
        }
        return nGrams;
    }

    /**
     * @return boolean expression, identical to {@link #getLuceneQueryString()}
     */
    @Override
    public String toString() {
        return this.getLuceneQueryString();
    }

    /**
     * This function recursively connects
     * operand in {@code operandList} and subqueries in {@code subQueryList}
     * with {@code operator}. <br>
     * It generates a string representing the query that is intended to be parsed by Lucene. <br>
     * An ANY node yields {@code DataConstants.SCAN_QUERY}; a NONE node, or a node
     * with nothing to join, yields an empty string.
     * @return boolean expression
     */
    String getLuceneQueryString() {
        if (operator == QueryOp.ANY) {
            return DataConstants.SCAN_QUERY;
        } else if (operator == QueryOp.NONE) {
            return "";
        } else {
            StringJoiner joiner = new StringJoiner(
                    (operator == QueryOp.AND) ? " AND " : " OR ");
            for (String operand : operandList) {
                joiner.add(operand);
            }
            for (GramBooleanQuery subQuery : subQueryList) {
                String subQueryStr = subQuery.getLuceneQueryString();
                // Skip sub-queries that contribute nothing, to avoid dangling operators.
                if (! subQueryStr.equals("")) {
                    joiner.add(subQueryStr);
                }
            }

            if (joiner.length() == 0) {
                return "";
            } else {
                return "(" + joiner.toString() + ")";
            }
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package edu.uci.ics.textdb.dataflow.regexmatch;

import java.util.ArrayList;
import java.util.List;

/**
 * RegexInfo for translating regex to an n-gram boolean query. <br>
 * see <a href='https://swtch.com/~rsc/regexp/regexp4.html'>https://swtch.com/~rsc/regexp/regexp4.html</a> for details. <br>
 *
 * @author Zuozhi Wang
 * @author Shuying Lai
 */
class RegexInfo {
    boolean emptyable = false;
    List<String> exact = new ArrayList<String>();
    List<String> prefix = new ArrayList<String>();
    List<String> suffix = new ArrayList<String>();
    GramBooleanQuery match = new GramBooleanQuery(GramBooleanQuery.QueryOp.ANY);

    /**
     * Creates a RegexInfo in its default state:
     * emptyable is false,
     * exact, prefix and suffix are empty ArrayLists,
     * match is a query with operator ANY.
     */
    RegexInfo() {
        // Nothing to do here: every field is set up by its initializer above.
    }

    /**
     * This function shouldn't be called unless something goes wrong.
     * It is used to handle error cases.
     *
     * @return RegexInfo describing a regex that matches NO string
     */
    static RegexInfo matchNone() {
        RegexInfo nothing = new RegexInfo();
        nothing.match.operator = GramBooleanQuery.QueryOp.NONE;
        return nothing;
    }

    /**
     * Prefix and suffix each contain the empty string; exact stays empty.
     *
     * @return RegexInfo describing a regex that matches ANY string
     */
    static RegexInfo matchAny() {
        RegexInfo anything = new RegexInfo();
        // match already carries operator ANY from the default state.
        anything.prefix.add("");
        anything.suffix.add("");
        anything.emptyable = true;
        return anything;
    }

    /**
     * Exact contains only the empty string; prefix and suffix stay empty.
     *
     * @return RegexInfo describing a regex that matches an EMPTY string
     */
    static RegexInfo emptyString() {
        RegexInfo empty = new RegexInfo();
        // match already carries operator ANY from the default state.
        empty.exact.add("");
        empty.emptyable = true;
        return empty;
    }

    /**
     * The result is simply the default state: not emptyable, match ANY,
     * and exact, prefix and suffix left as empty lists (they are not null),
     * because we don't know the exact character.
     *
     * @return RegexInfo describing a regex that matches ANY SINGLE character
     */
    static RegexInfo anyChar() {
        return new RegexInfo();
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package edu.uci.ics.textdb.dataflow.regexmatch;

import com.google.re2j.PublicParser;
import com.google.re2j.PublicRE2;
import com.google.re2j.PublicRegexp;
import com.google.re2j.PublicSimplify;

/**
 * This class translates a regex to a boolean query of n-grams,
 * according to the <a href='https://swtch.com/~rsc/regexp/regexp4.html'>algorithm</a>
 * described in Russ Cox's article. <br>
 *
 * @author Zuozhi Wang
 * @author Shuying Lai
 */
public class RegexToGramQueryTranslator {

    /**
     * Translates a regular expression to a boolean expression of n-grams. <br>
     * The boolean expression can then be queried using
     * an n-gram inverted index to speed up regex matching. <br>
     * The regex is parsed with RE2J first; when RE2J rejects it, the result
     * depends on whether {@code java.util.regex} accepts it
     * (match-any if it does, match-none if it does not).
     *
     * @param regex the regex string to be translated.
     * @return GramBooleanQuery, a boolean query of n-grams.
     */
    public static GramBooleanQuery translate(String regex) {
        try {
            PublicRegexp simplified =
                    PublicSimplify.simplify(PublicParser.parse(regex, PublicRE2.PERL));
            return analyze(simplified).match;
        } catch (com.google.re2j.PatternSyntaxException re2jFailure) {
            return queryForUnparsedRegex(regex);
        }
    }

    /**
     * Chooses the query for a regex that RE2J could not parse. <br>
     *
     * @param regex the regex string rejected by RE2J
     * @return the match-any query (scan based) when the Java regex engine
     *         compiles the pattern, the match-none query otherwise (not a regex)
     */
    private static GramBooleanQuery queryForUnparsedRegex(String regex) {
        boolean acceptedByJava;
        try {
            java.util.regex.Pattern.compile(regex);
            acceptedByJava = true;
        } catch (java.util.regex.PatternSyntaxException javaFailure) {
            acceptedByJava = false;
        }
        return acceptedByJava
                ? RegexInfo.matchAny().match
                : RegexInfo.matchNone().match;
    }

    /**
     * This is the main function of analyzing a regular expression. <br>
     * It inspects the operator of the given node of the regex abstract syntax
     * tree generated by RE2J and builds the corresponding RegexInfo,
     * recursing into the first sub-expression for PLUS. <br>
     * Operators that are not handled yet produce a match-any RegexInfo.
     *
     * @param re a node of the parsed regex
     * @return RegexInfo describing that node
     */
    private static RegexInfo analyze(PublicRegexp re) {
        switch (re.getOp()) {
        // A regex that doesn't match anything. It's used to handle error
        // cases, which shouldn't happen unless something goes wrong.
        case NO_MATCH:
            return RegexInfo.matchNone();

        // All of these are treated as a regex that matches an empty string.
        case EMPTY_MATCH:
        case WORD_BOUNDARY:
        case NO_WORD_BOUNDARY:
        case BEGIN_LINE:
        case END_LINE:
        case BEGIN_TEXT:
        case END_TEXT:
            return RegexInfo.emptyString();

        // A regex that matches any character.
        case ANY_CHAR:
        case ANY_CHAR_NOT_NL:
            return RegexInfo.anyChar();

        // One or more occurrences of an expression: "(expr)+" has the same
        // info as "expr", except that "exact" is null, because we don't know
        // the number of repetitions.
        case PLUS: {
            RegexInfo repeated = analyze(re.getSubs()[0]);
            repeated.exact = null;
            return repeated;
        }

        // Zero or more occurrences of an expression.
        case STAR:
        // TODO finish for every case below; until then they fall back to match-any.
        case ALTERNATE:
        case CAPTURE:
        case CHAR_CLASS:
        case CONCAT:
        case LEFT_PAREN:
        case LITERAL:
        case QUEST:
        // REPEAT: an expression matched at least min times, at most max times.
        case REPEAT:
        case VERTICAL_BAR:
        default:
            return RegexInfo.matchAny();
        }
    }

}
Loading

0 comments on commit fc894b7

Please sign in to comment.