diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java index fa4dcdebf9f..f646cab3b2a 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java @@ -2,6 +2,7 @@ import java.util.ArrayList; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.StringJoiner; @@ -293,6 +294,152 @@ private static String toLuceneQueryString(GramBooleanQuery query) { } } } + + public boolean isEmpty() { + if (this.operandSet.size() > 0) { + return false; + } + for (GramBooleanQuery subQuery : this.subQuerySet) { + if (! subQuery.isEmpty()) { + return false; + } + } + return true; + } + + // "AND" two DNF trees (trees are assumed to be in DNF form) + // Apply distributive laws: + // a AND (b OR c) = (a AND b) OR (a AND c) + // (a OR b) AND (c OR d) = (a AND c) OR (a AND d) OR (b AND c) OR (c AND d) + private static GramBooleanQuery andDNF(GramBooleanQuery left, GramBooleanQuery right) { + if (left.isEmpty()) { + return right; + } + if (right.isEmpty()) { + return left; + } + GramBooleanQuery resultQuery = new GramBooleanQuery(QueryOp.OR); + for (String leftOperand : left.operandSet) { + for (String rightOperand : right.operandSet) { + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.add(leftOperand); + tempQuery.operandSet.add(rightOperand); + resultQuery.subQuerySet.add(tempQuery); + } + for (GramBooleanQuery rightSubQuery : right.subQuerySet) { + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.add(leftOperand); + tempQuery.operandSet.addAll(rightSubQuery.operandSet); + resultQuery.subQuerySet.add(tempQuery); + } + } + for (GramBooleanQuery leftSubQuery : left.subQuerySet) { + for (String rightOperand : right.operandSet) { + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.addAll(leftSubQuery.operandSet); + tempQuery.operandSet.add(rightOperand); + resultQuery.subQuerySet.add(tempQuery); + } + for (GramBooleanQuery rightSubQuery : right.subQuerySet) { + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.addAll(leftSubQuery.operandSet); + tempQuery.operandSet.addAll(rightSubQuery.operandSet); + resultQuery.subQuerySet.add(tempQuery); + } + } + return resultQuery; + } + + + /** + * Simplify a tree, which is assumed to be already in DNF form + * Apply Absorption laws: a OR (a AND b) = a + * + * Simplification is extremely important, because it removes lots of redundant information, + * thus enabling comparison of two trees, + * + * @param DNFQuery + * @return simplifiedDNFQuery + */ + public static GramBooleanQuery simplifyDNF(GramBooleanQuery query) { + GramBooleanQuery result = new GramBooleanQuery(QueryOp.OR); + result.operandSet.addAll(query.operandSet); + + Iterator outerIterator = query.subQuerySet.iterator(); + OuterLoop: + while (outerIterator.hasNext()) { + GramBooleanQuery outerAndQuery = outerIterator.next(); + for (String operand : query.operandSet) { + if (outerAndQuery.operandSet.contains(operand)) { + continue OuterLoop; + } + } + Iterator innerIterator = query.subQuerySet.iterator(); + while (innerIterator.hasNext()) { + GramBooleanQuery innerAndQuery = innerIterator.next(); + if (outerAndQuery != innerAndQuery) { + if (outerAndQuery.operandSet.containsAll(innerAndQuery.operandSet)) { + outerIterator.remove(); + continue OuterLoop; + } + } + } + // if reach this code, then add a copy of it to result + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.addAll(outerAndQuery.operandSet); + result.subQuerySet.add(tempQuery); + } + + return query; + } + + /** + * The query tree generated by the translator is messy with possibly lots of redundant information. + * This function transforms it into Disjunctive normal form (DNF), which is an OR of different ANDs. + * + * To transform a tree to DNF form, the following laws are applied recursively from bottom to top: + * Associative laws: (a OR b) OR c = a OR (b OR c) = a OR b OR c, when transforming OR nodes, + * Distributive laws: a AND (b OR c) = (a AND b) OR (a AND c), when transforming AND nodes, + * + * For each node, its children will be transformed to DNF form first, then + * if it's OR, apply associative laws, if it's AND, apply distributive laws. + * Then recursively apply the same rules all the way up to the top node. + * + * The result is NOT simplified. Must call simplifyDNF() to obtain the optimal tree. + * + * @param query + * @return DNFQuery + */ + public static GramBooleanQuery toDNF(GramBooleanQuery query) { + if (query.operator == QueryOp.AND) { + GramBooleanQuery firstOrNode = new GramBooleanQuery(QueryOp.OR); + if (query.operandSet.size() != 0) { + GramBooleanQuery firstAndNode = new GramBooleanQuery(QueryOp.AND); + firstAndNode.operandSet.addAll(query.operandSet); + firstOrNode.subQuerySet.add(firstAndNode); + } + + ArrayList subDNFList = new ArrayList<>(); + for (GramBooleanQuery subQuery : query.subQuerySet) { + subDNFList.add(toDNF(subQuery)); + } + + GramBooleanQuery result = subDNFList.stream().reduce(firstOrNode, (left, right) -> andDNF(left, right)); + return result; + } else if (query.operator == QueryOp.OR) { + GramBooleanQuery result = new GramBooleanQuery(QueryOp.OR); + result.operandSet.addAll(query.operandSet); + for (GramBooleanQuery subQuery : query.subQuerySet) { + GramBooleanQuery newSubQuery = toDNF(subQuery); + result.subQuerySet.addAll(newSubQuery.subQuerySet); + result.operandSet.addAll(newSubQuery.operandSet); + } + return result; + } + + // ANY or NONE, no need to simplify + return query; + } } diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java index 4d1205bef3b..a26551ff314 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java @@ -12,138 +12,315 @@ public class RegexToGramQueryTranslatorTest { + /* + * We need to check equivalence of two trees, but two equivalent trees could have many different forms. + * The equals function in GramBooleanQuery only compares two trees shallowly, + * it returns true if two trees' form (and content) are identical. + * + * So we transform the tree to DNF form, and apply simplifications to remove redundant nodes. + * After transformation and simplification, two equivalent trees should have identical form. + * Then we can use the equals() function two check equivalence. + * + */ + + // Helper function to print query tree for debugging purposes. private void printTranslatorResult(String regex) { GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); System.out.println("regex: "+regex); System.out.println("boolean expression: "+exactQuery.getLuceneQueryString()); + System.out.println("query tree: "); System.out.println(exactQuery.printQueryTree()); + + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + System.out.println("Simplified DNF: "); + System.out.println(simplifiedDNF.printQueryTree()); + System.out.println(); } @Test public void testEmptyRegex() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(""); + String regex = ""; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.ANY); + +// printTranslatorResult(regex); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testStarRegex() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("a*"); + String regex = "a*"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.ANY); - Assert.assertTrue(exactQuery.equals(expectedQuery)); +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testLiteral1() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("abc"); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"abc"})); + String regex = "abc"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList("abc")); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } + // "ab" can't form a gram(default length 3), so the result is an empty OR node. @Test public void testLiteral2() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("ab"); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{})); + String regex = "ab"; - Assert.assertTrue(exactQuery.equals(expectedQuery)); + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testLiteral3() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("abcd"); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"abc", "bcd"})); + String regex = "abcd"; - Assert.assertTrue(exactQuery.equals(expectedQuery)); + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList("abc", "bcd")); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testLiteral4() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("ucirvine"); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"uci", "cir", "irv", "rvi", "vin", "ine"})); + String regex = "ucirvine"; - Assert.assertTrue(exactQuery.equals(expectedQuery)); - } - - @Test - public void testLiteral5() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("textdb"); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"tex", "ext", "xtd", "tdb"})); + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList("uci", "cir", "irv", "rvi", "vin", "ine")); + expectedQuery.subQuerySet.add(expectedAndNode); - Assert.assertTrue(exactQuery.equals(expectedQuery)); +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testCharClass1() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("[a-b][c-d][e-f]"); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - GramBooleanQuery expectedQueryOrLevel = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); - expectedQueryOrLevel.operandSet.addAll(Arrays.asList( - new String[]{"ace", "acf", "bce", "bcf", "ade", "adf", "bde", "bdf"})); - expectedQuery.subQuerySet.add(expectedQueryOrLevel); + String regex = "[a-b][c-d][e-f]"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + expectedQuery.operandSet.addAll(Arrays.asList( + "ace", "acf", "bce", "bcf", "ade", "adf", "bde", "bdf")); + +// printTranslatorResult(regex); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } - - // We can't write expectedQuery for the following expressions, - // due to the complexity of the query itself, - // and the boolean query is not simplified and contains lots of redundant information + @Test public void testAlternate1() { - printTranslatorResult("uci|ics"); + String regex = "uci|ics"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + expectedQuery.operandSet.addAll(Arrays.asList( + "uci", "ics")); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testAlternate2() { - printTranslatorResult("data*(bcd|pqr)"); + String regex = "data*(bcd|pqr)"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedFirstAnd.operandSet.addAll(Arrays.asList("dat", "bcd")); + expectedQuery.subQuerySet.add(expectedFirstAnd); + GramBooleanQuery expectedSecondAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedSecondAnd.operandSet.addAll(Arrays.asList("dat", "pqr")); + expectedQuery.subQuerySet.add(expectedSecondAnd); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); + } @Test public void testPlus1() { - printTranslatorResult("abc+"); + String regex = "abc+"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList("abc")); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testPlus2() { - printTranslatorResult("abc+pqr+"); + String regex = "abc+pqr+"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedFirstAnd.operandSet.addAll(Arrays.asList("abc", "cpq", "pqr")); + expectedQuery.subQuerySet.add(expectedFirstAnd); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testQuest1() { - printTranslatorResult("abc?"); + String regex = "abc?"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testQuest2() { - printTranslatorResult("abc?pqr?"); + String regex = "abc?pqr?"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedFirstAnd.operandSet.addAll(Arrays.asList("abp", "bpq")); + expectedQuery.subQuerySet.add(expectedFirstAnd); + GramBooleanQuery expectedSecondAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedSecondAnd.operandSet.addAll(Arrays.asList("abc", "bcp", "cpq")); + expectedQuery.subQuerySet.add(expectedSecondAnd); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test // RE2J will simplify REPEAT to equivalent form with QUEST. // abc{1,3} will be simplified to abcc?c? public void testRepeat1() { - printTranslatorResult("abc{1,3}"); + String regex = "abc{1,3}"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList("abc")); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testCapture1() { - printTranslatorResult("(abc)(qwer)"); + String regex = "(abc)(qwer)"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedFirstAnd.operandSet.addAll(Arrays.asList("abc", "bcq", "cqw", "qwe", "wer")); + expectedQuery.subQuerySet.add(expectedFirstAnd); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test public void testRegexCropUrl() { - printTranslatorResult("^(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?$"); + String regex = "^(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?$"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + +// printTranslatorResult(regex); + + Assert.assertEquals(expectedQuery, simplifiedDNF); } } \ No newline at end of file