From 0365b089c8316b56abb368b380533729d48459ef Mon Sep 17 00:00:00 2001 From: zuozhiw Date: Tue, 7 Jun 2016 19:22:39 -0700 Subject: [PATCH 1/6] add convert to DNF --- .../dataflow/regexmatch/GramBooleanQuery.java | 133 ++++++++++++++++++ .../RegexToGramQueryTranslatorTest.java | 60 ++++++-- 2 files changed, 178 insertions(+), 15 deletions(-) diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java index fa4dcdebf9f..df213ec3af5 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java @@ -2,6 +2,7 @@ import java.util.ArrayList; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.StringJoiner; @@ -293,6 +294,138 @@ private static String toLuceneQueryString(GramBooleanQuery query) { } } } + + public boolean isEmpty() { + if (this.operandSet.size() > 0) { + return false; + } + for (GramBooleanQuery subQuery : this.subQuerySet) { + if (! 
subQuery.isEmpty()) { + return false; + } + } + return true; + } + + // "AND" two DNF tree + // Apply distributive laws: + // (a OR b) AND (c OR d) --> (a AND c) OR (a AND d) OR (b AND c) OR (c AND d) + private static GramBooleanQuery andDNF(GramBooleanQuery left, GramBooleanQuery right) { + if (left.isEmpty()) { + return right; + } + if (right.isEmpty()) { + return left; + } + GramBooleanQuery resultQuery = new GramBooleanQuery(QueryOp.OR); + for (String leftOperand : left.operandSet) { + for (String rightOperand : right.operandSet) { + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.add(leftOperand); + tempQuery.operandSet.add(rightOperand); + resultQuery.subQuerySet.add(tempQuery); + } + for (GramBooleanQuery rightSubQuery : right.subQuerySet) { + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.add(leftOperand); + tempQuery.operandSet.addAll(rightSubQuery.operandSet); + resultQuery.subQuerySet.add(tempQuery); + } + } + for (GramBooleanQuery leftSubQuery : left.subQuerySet) { + for (String rightOperand : right.operandSet) { + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.addAll(leftSubQuery.operandSet); + tempQuery.operandSet.add(rightOperand); + resultQuery.subQuerySet.add(tempQuery); + } + for (GramBooleanQuery rightSubQuery : right.subQuerySet) { + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.addAll(leftSubQuery.operandSet); + tempQuery.operandSet.addAll(rightSubQuery.operandSet); + resultQuery.subQuerySet.add(tempQuery); + } + } + return resultQuery; + } + + // After Transforming to DNF, apply Absorption laws to simplify it + // a OR (a AND b) --> a + // Tree must be already transformed to DNF before calling this function! 
+ public static GramBooleanQuery simplifyDNF(GramBooleanQuery query) { + GramBooleanQuery result = new GramBooleanQuery(QueryOp.OR); + result.operandSet.addAll(query.operandSet); + + + Iterator outerIterator = query.subQuerySet.iterator(); + OuterLoop: + while (outerIterator.hasNext()) { + GramBooleanQuery outerAndQuery = outerIterator.next(); + System.out.println("checking: "+outerAndQuery.operandSet); + for (String operand : query.operandSet) { + System.out.println("with: "+operand); + if (outerAndQuery.operandSet.contains(operand)) { + System.out.println("don't add"); + continue OuterLoop; + } + } + Iterator innerIterator = query.subQuerySet.iterator(); + while (innerIterator.hasNext()) { + GramBooleanQuery innerAndQuery = innerIterator.next(); + System.out.println("with: "+innerAndQuery.operandSet); + if (outerAndQuery != innerAndQuery) { + if (outerAndQuery.operandSet.containsAll(innerAndQuery.operandSet)) { + System.out.println("don't add"); + outerIterator.remove(); + continue OuterLoop; + } + } + } + // if reach this code, then add a copy of it to result + GramBooleanQuery tempQuery = new GramBooleanQuery(QueryOp.AND); + tempQuery.operandSet.addAll(outerAndQuery.operandSet); + result.subQuerySet.add(tempQuery); + } + + return query; + } + + // Transform the GramBooleanQuery tree to Disjunctive normal form (DNF) + // which is OR of different ANDs + public static GramBooleanQuery toDNF(GramBooleanQuery query) { + if (query.operator == QueryOp.AND) { + GramBooleanQuery firstOrNode = new GramBooleanQuery(QueryOp.OR); + if (query.operandSet.size() != 0) { + GramBooleanQuery firstAndNode = new GramBooleanQuery(QueryOp.AND); + firstAndNode.operandSet.addAll(query.operandSet); + firstOrNode.subQuerySet.add(firstAndNode); + } + + ArrayList subDNFList = new ArrayList<>(); + for (GramBooleanQuery subQuery : query.subQuerySet) { + subDNFList.add(toDNF(subQuery)); + } + + GramBooleanQuery result = subDNFList.stream().reduce(firstOrNode, (left, right) -> andDNF(left, 
right)); + return result; + } else if (query.operator == QueryOp.OR) { + GramBooleanQuery result = new GramBooleanQuery(QueryOp.OR); + result.operandSet.addAll(query.operandSet); + for (GramBooleanQuery subQuery : query.subQuerySet) { + GramBooleanQuery newSubQuery = toDNF(subQuery); + result.subQuerySet.addAll(newSubQuery.subQuerySet); + result.operandSet.addAll(newSubQuery.operandSet); + } + return result; + } + + // ANY or NONE, no need to simplify + return query; + } + + + + } diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java index 4d1205bef3b..0e48c0a86d0 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java @@ -17,81 +17,111 @@ private void printTranslatorResult(String regex) { System.out.println("regex: "+regex); System.out.println("boolean expression: "+exactQuery.getLuceneQueryString()); + System.out.println("query tree: "); System.out.println(exactQuery.printQueryTree()); System.out.println(); + + System.out.println("DNF: "); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + System.out.println(dnf.printQueryTree()); + System.out.println(); + + System.out.println("Simplified DNF: "); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + System.out.println(simplifiedDNF.printQueryTree()); + System.out.println(); + + System.out.println(); } @Test public void testEmptyRegex() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(""); + String regex = ""; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.ANY); - + 
printTranslatorResult(regex); + Assert.assertTrue(exactQuery.equals(expectedQuery)); } @Test public void testStarRegex() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("a*"); + String regex = "a*"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.ANY); - + printTranslatorResult(regex); + Assert.assertTrue(exactQuery.equals(expectedQuery)); } @Test public void testLiteral1() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("abc"); + String regex = "abc"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"abc"})); - + printTranslatorResult(regex); + Assert.assertTrue(exactQuery.equals(expectedQuery)); } @Test public void testLiteral2() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("ab"); + String regex = "ab"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); expectedQuery.operandSet.addAll(Arrays.asList(new String[]{})); - + printTranslatorResult(regex); + Assert.assertTrue(exactQuery.equals(expectedQuery)); } @Test public void testLiteral3() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("abcd"); + String regex = "abcd"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"abc", "bcd"})); - + printTranslatorResult(regex); + Assert.assertTrue(exactQuery.equals(expectedQuery)); } @Test public void testLiteral4() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("ucirvine"); + String regex = "ucirvine"; + 
GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"uci", "cir", "irv", "rvi", "vin", "ine"})); - + printTranslatorResult(regex); + Assert.assertTrue(exactQuery.equals(expectedQuery)); } @Test public void testLiteral5() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("textdb"); + String regex = "textdb"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"tex", "ext", "xtd", "tdb"})); - + printTranslatorResult(regex); + Assert.assertTrue(exactQuery.equals(expectedQuery)); } @Test public void testCharClass1() { - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate("[a-b][c-d][e-f]"); + String regex = "[a-b][c-d][e-f]"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); GramBooleanQuery expectedQueryOrLevel = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); expectedQueryOrLevel.operandSet.addAll(Arrays.asList( new String[]{"ace", "acf", "bce", "bcf", "ade", "adf", "bde", "bdf"})); expectedQuery.subQuerySet.add(expectedQueryOrLevel); + printTranslatorResult(regex); + Assert.assertTrue(exactQuery.equals(expectedQuery)); } From 7c9fa560a38f8127e88140e8b1a92189e15908c9 Mon Sep 17 00:00:00 2001 From: zuozhiw Date: Tue, 7 Jun 2016 19:26:46 -0700 Subject: [PATCH 2/6] remove print statements --- .../ics/textdb/dataflow/regexmatch/GramBooleanQuery.java | 5 ----- .../regexmatch/RegexToGramQueryTranslatorTest.java | 7 +------ 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java 
b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java index df213ec3af5..f865a8a71d4 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java @@ -361,21 +361,16 @@ public static GramBooleanQuery simplifyDNF(GramBooleanQuery query) { OuterLoop: while (outerIterator.hasNext()) { GramBooleanQuery outerAndQuery = outerIterator.next(); - System.out.println("checking: "+outerAndQuery.operandSet); for (String operand : query.operandSet) { - System.out.println("with: "+operand); if (outerAndQuery.operandSet.contains(operand)) { - System.out.println("don't add"); continue OuterLoop; } } Iterator innerIterator = query.subQuerySet.iterator(); while (innerIterator.hasNext()) { GramBooleanQuery innerAndQuery = innerIterator.next(); - System.out.println("with: "+innerAndQuery.operandSet); if (outerAndQuery != innerAndQuery) { if (outerAndQuery.operandSet.containsAll(innerAndQuery.operandSet)) { - System.out.println("don't add"); outerIterator.remove(); continue OuterLoop; } diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java index 0e48c0a86d0..21ff7161ad1 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java @@ -20,17 +20,12 @@ private void printTranslatorResult(String regex) { System.out.println("query tree: "); System.out.println(exactQuery.printQueryTree()); - System.out.println(); - System.out.println("DNF: "); GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); - 
System.out.println(dnf.printQueryTree()); - System.out.println(); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); System.out.println("Simplified DNF: "); - GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); System.out.println(simplifiedDNF.printQueryTree()); - System.out.println(); System.out.println(); } From 8416c34354ddc5888e29ad41e6b6ac3c7b2e7f0d Mon Sep 17 00:00:00 2001 From: zuozhiw Date: Tue, 7 Jun 2016 19:49:27 -0700 Subject: [PATCH 3/6] automate all tests --- .../RegexToGramQueryTranslatorTest.java | 249 +++++++++++++++--- 1 file changed, 206 insertions(+), 43 deletions(-) diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java index 21ff7161ad1..2bd6d46a40f 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java @@ -33,142 +33,305 @@ private void printTranslatorResult(String regex) { @Test public void testEmptyRegex() { String regex = ""; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.ANY); - printTranslatorResult(regex); - Assert.assertTrue(exactQuery.equals(expectedQuery)); +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testStarRegex() { String regex = "a*"; GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = 
GramBooleanQuery.simplifyDNF(dnf); + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.ANY); - printTranslatorResult(regex); + +// printTranslatorResult(regex); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testLiteral1() { String regex = "abc"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"abc"})); - printTranslatorResult(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"abc"})); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testLiteral2() { String regex = "ab"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{})); - printTranslatorResult(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + +// printTranslatorResult(regex); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testLiteral3() { String regex = "abcd"; + GramBooleanQuery exactQuery = 
RegexToGramQueryTranslator.translate(regex); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"abc", "bcd"})); - printTranslatorResult(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"abc", "bcd"})); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testLiteral4() { String regex = "ucirvine"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"uci", "cir", "irv", "rvi", "vin", "ine"})); - printTranslatorResult(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"uci", "cir", "irv", "rvi", "vin", "ine"})); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testLiteral5() { String regex = "textdb"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); - GramBooleanQuery expectedQuery = new 
GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedQuery.operandSet.addAll(Arrays.asList(new String[]{"tex", "ext", "xtd", "tdb"})); - printTranslatorResult(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"tex", "ext", "xtd", "tdb"})); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testCharClass1() { String regex = "[a-b][c-d][e-f]"; + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - GramBooleanQuery expectedQueryOrLevel = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); - expectedQueryOrLevel.operandSet.addAll(Arrays.asList( + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + expectedQuery.operandSet.addAll(Arrays.asList( new String[]{"ace", "acf", "bce", "bcf", "ade", "adf", "bde", "bdf"})); - expectedQuery.subQuerySet.add(expectedQueryOrLevel); - printTranslatorResult(regex); +// printTranslatorResult(regex); - Assert.assertTrue(exactQuery.equals(expectedQuery)); + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } - - // We can't write expectedQuery for the following expressions, - // due to the complexity of the query itself, - // and the boolean query is not simplified and contains lots of redundant information + @Test public void testAlternate1() { - 
printTranslatorResult("uci|ics"); + String regex = "uci|ics"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + expectedQuery.operandSet.addAll(Arrays.asList( + new String[]{"uci", "ics"})); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testAlternate2() { - printTranslatorResult("data*(bcd|pqr)"); + String regex = "data*(bcd|pqr)"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedFirstAnd.operandSet.addAll(Arrays.asList( + new String[]{"dat", "bcd"})); + expectedQuery.subQuerySet.add(expectedFirstAnd); + GramBooleanQuery expectedSecondAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedSecondAnd.operandSet.addAll(Arrays.asList( + new String[]{"dat", "pqr"})); + expectedQuery.subQuerySet.add(expectedSecondAnd); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + } @Test public void testPlus1() { - printTranslatorResult("abc+"); + String regex = "abc+"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + 
expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"abc"})); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testPlus2() { - printTranslatorResult("abc+pqr+"); + String regex = "abc+pqr+"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedFirstAnd.operandSet.addAll(Arrays.asList( + new String[]{"abc", "cpq", "pqr"})); + expectedQuery.subQuerySet.add(expectedFirstAnd); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testQuest1() { - printTranslatorResult("abc?"); + String regex = "abc?"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testQuest2() { - printTranslatorResult("abc?pqr?"); + String regex = "abc?pqr?"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedFirstAnd.operandSet.addAll(Arrays.asList( + new String[]{"abp", 
"bpq"})); + expectedQuery.subQuerySet.add(expectedFirstAnd); + GramBooleanQuery expectedSecondAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedSecondAnd.operandSet.addAll(Arrays.asList( + new String[]{"abc", "bcp", "cpq"})); + expectedQuery.subQuerySet.add(expectedSecondAnd); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test // RE2J will simplify REPEAT to equivalent form with QUEST. // abc{1,3} will be simplified to abcc?c? public void testRepeat1() { - printTranslatorResult("abc{1,3}"); + String regex = "abc{1,3}"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"abc"})); + expectedQuery.subQuerySet.add(expectedAndNode); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testCapture1() { - printTranslatorResult("(abc)(qwer)"); + String regex = "(abc)(qwer)"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); + expectedFirstAnd.operandSet.addAll(Arrays.asList( + new String[]{"abc", "bcq", "cqw", "qwe", "wer"})); + expectedQuery.subQuerySet.add(expectedFirstAnd); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } @Test public void testRegexCropUrl() { - 
printTranslatorResult("^(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?$"); + String regex = "^(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?$"; + + GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); + GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); + GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); + + GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); + +// printTranslatorResult(regex); + + Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); } } \ No newline at end of file From af09fda6c0a4b7f3294d818e3cd61e98070322bb Mon Sep 17 00:00:00 2001 From: zuozhiw Date: Mon, 13 Jun 2016 06:49:03 -0700 Subject: [PATCH 4/6] add examples to comments, modify assert statement, modify Arrays.asList() --- .../dataflow/regexmatch/GramBooleanQuery.java | 43 ++++++---- .../RegexToGramQueryTranslatorTest.java | 84 +++++++------------ 2 files changed, 60 insertions(+), 67 deletions(-) diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java index f865a8a71d4..9c3f7937e48 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java @@ -307,9 +307,10 @@ public boolean isEmpty() { return true; } - // "AND" two DNF tree - // Apply distributive laws: - // (a OR b) AND (c OR d) --> (a AND c) OR (a AND d) OR (b AND c) OR (c AND d) + // "AND" two DNF trees (trees are assumed to be in DNF form) + // Apply distributive laws: + // a AND (b OR c) = (a AND b) OR (a AND c) + // (a OR b) AND (c OR d) = (a AND c) OR (a AND d) OR (b AND c) OR (b AND d) private static GramBooleanQuery andDNF(GramBooleanQuery left, GramBooleanQuery right) { if (left.isEmpty()) { 
return right; @@ -349,14 +350,22 @@ private static GramBooleanQuery andDNF(GramBooleanQuery left, GramBooleanQuery r return resultQuery; } - // After Transforming to DNF, apply Absorption laws to simplify it - // a OR (a AND b) --> a - // Tree must be already transformed to DNF before calling this function! + + /** + * simplify a tree, which is assumed to be already in DNF form + * Apply Absorption laws: a OR (a AND b) = a + * + * Simplifying is important because it enables comparison between two trees. + * Two equivalent trees could have different forms. However, after transforming to DNF + * and applying simplifications, their forms should be identical. + * + * @param query a query tree that is already in DNF form + * @return simplifiedDNFQuery + */ public static GramBooleanQuery simplifyDNF(GramBooleanQuery query) { GramBooleanQuery result = new GramBooleanQuery(QueryOp.OR); result.operandSet.addAll(query.operandSet); - Iterator outerIterator = query.subQuerySet.iterator(); OuterLoop: while (outerIterator.hasNext()) { @@ -385,8 +394,19 @@ public static GramBooleanQuery simplifyDNF(GramBooleanQuery query) { return query; } - // Transform the GramBooleanQuery tree to Disjunctive normal form (DNF) - // which is OR of different ANDs + + /** + * The query tree generated by the translator is messy with possibly lots of redundant information. + * This function transforms it into Disjunctive normal form (DNF), which is an OR of different ANDs. + * + * For example, the original query of regex "abc?pqr?" is: + * (((bpq AND abp) OR (abc AND cpq AND bcp)) AND (bpq OR cpq) AND (bpq OR cpq OR (pqr AND cpq) OR (pqr AND bpq)) AND (pqr OR bpq OR cpq) AND (abc OR abp)) + * which is long and messy. 
After calling toDNF() and simplifyDNF(), the result query is: + * ((abp AND bpq) OR (abc AND bcp AND cpq)) + * + * @param query + * @return DNFQuery + */ public static GramBooleanQuery toDNF(GramBooleanQuery query) { if (query.operator == QueryOp.AND) { GramBooleanQuery firstOrNode = new GramBooleanQuery(QueryOp.OR); @@ -418,9 +438,4 @@ public static GramBooleanQuery toDNF(GramBooleanQuery query) { return query; } - - - - - } diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java index 2bd6d46a40f..3dc3667e674 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java @@ -12,6 +12,7 @@ public class RegexToGramQueryTranslatorTest { + // Helper function to print query tree for debugging purposes. 
private void printTranslatorResult(String regex) { GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); @@ -42,7 +43,7 @@ public void testEmptyRegex() { // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -56,7 +57,7 @@ public void testStarRegex() { // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -69,14 +70,15 @@ public void testLiteral1() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"abc"})); + expectedAndNode.operandSet.addAll(Arrays.asList("abc")); expectedQuery.subQuerySet.add(expectedAndNode); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } + // "ab" can't form a trigram, so the result is an empty OR node. 
@Test public void testLiteral2() { String regex = "ab"; @@ -89,7 +91,7 @@ public void testLiteral2() { // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -102,12 +104,12 @@ public void testLiteral3() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"abc", "bcd"})); + expectedAndNode.operandSet.addAll(Arrays.asList("abc", "bcd")); expectedQuery.subQuerySet.add(expectedAndNode); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -120,30 +122,12 @@ public void testLiteral4() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"uci", "cir", "irv", "rvi", "vin", "ine"})); + expectedAndNode.operandSet.addAll(Arrays.asList("uci", "cir", "irv", "rvi", "vin", "ine")); expectedQuery.subQuerySet.add(expectedAndNode); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); - } - - @Test - public void testLiteral5() { - String regex = "textdb"; - - GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); - GramBooleanQuery dnf = GramBooleanQuery.toDNF(exactQuery); - GramBooleanQuery simplifiedDNF = GramBooleanQuery.simplifyDNF(dnf); - - GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); - GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"tex", "ext", "xtd", "tdb"})); - expectedQuery.subQuerySet.add(expectedAndNode); - -// 
printTranslatorResult(regex); - - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -156,11 +140,11 @@ public void testCharClass1() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); expectedQuery.operandSet.addAll(Arrays.asList( - new String[]{"ace", "acf", "bce", "bcf", "ade", "adf", "bde", "bdf"})); + "ace", "acf", "bce", "bcf", "ade", "adf", "bde", "bdf")); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @@ -174,11 +158,11 @@ public void testAlternate1() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); expectedQuery.operandSet.addAll(Arrays.asList( - new String[]{"uci", "ics"})); + "uci", "ics")); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -191,17 +175,15 @@ public void testAlternate2() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedFirstAnd.operandSet.addAll(Arrays.asList( - new String[]{"dat", "bcd"})); + expectedFirstAnd.operandSet.addAll(Arrays.asList("dat", "bcd")); expectedQuery.subQuerySet.add(expectedFirstAnd); GramBooleanQuery expectedSecondAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedSecondAnd.operandSet.addAll(Arrays.asList( - new String[]{"dat", "pqr"})); + expectedSecondAnd.operandSet.addAll(Arrays.asList("dat", "pqr")); expectedQuery.subQuerySet.add(expectedSecondAnd); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @@ -215,12 +197,12 @@ public void testPlus1() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); 
GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"abc"})); + expectedAndNode.operandSet.addAll(Arrays.asList("abc")); expectedQuery.subQuerySet.add(expectedAndNode); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -233,13 +215,12 @@ public void testPlus2() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedFirstAnd.operandSet.addAll(Arrays.asList( - new String[]{"abc", "cpq", "pqr"})); + expectedFirstAnd.operandSet.addAll(Arrays.asList("abc", "cpq", "pqr")); expectedQuery.subQuerySet.add(expectedFirstAnd); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -254,7 +235,7 @@ public void testQuest1() { // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -267,17 +248,15 @@ public void testQuest2() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedFirstAnd.operandSet.addAll(Arrays.asList( - new String[]{"abp", "bpq"})); + expectedFirstAnd.operandSet.addAll(Arrays.asList("abp", "bpq")); expectedQuery.subQuerySet.add(expectedFirstAnd); GramBooleanQuery expectedSecondAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedSecondAnd.operandSet.addAll(Arrays.asList( - new String[]{"abc", "bcp", "cpq"})); + expectedSecondAnd.operandSet.addAll(Arrays.asList("abc", "bcp", "cpq")); expectedQuery.subQuerySet.add(expectedSecondAnd); // printTranslatorResult(regex); - 
Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -292,12 +271,12 @@ public void testRepeat1() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); GramBooleanQuery expectedAndNode = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedAndNode.operandSet.addAll(Arrays.asList(new String[]{"abc"})); + expectedAndNode.operandSet.addAll(Arrays.asList("abc")); expectedQuery.subQuerySet.add(expectedAndNode); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -310,13 +289,12 @@ public void testCapture1() { GramBooleanQuery expectedQuery = new GramBooleanQuery(GramBooleanQuery.QueryOp.OR); GramBooleanQuery expectedFirstAnd = new GramBooleanQuery(GramBooleanQuery.QueryOp.AND); - expectedFirstAnd.operandSet.addAll(Arrays.asList( - new String[]{"abc", "bcq", "cqw", "qwe", "wer"})); + expectedFirstAnd.operandSet.addAll(Arrays.asList("abc", "bcq", "cqw", "qwe", "wer")); expectedQuery.subQuerySet.add(expectedFirstAnd); // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } @Test @@ -331,7 +309,7 @@ public void testRegexCropUrl() { // printTranslatorResult(regex); - Assert.assertTrue(simplifiedDNF.equals(expectedQuery)); + Assert.assertEquals(expectedQuery, simplifiedDNF); } } \ No newline at end of file From 6dc1801c2874f1f9b68a41f191d66998d5414526 Mon Sep 17 00:00:00 2001 From: zuozhiw Date: Tue, 14 Jun 2016 19:06:13 -0700 Subject: [PATCH 5/6] add high level explanation about: 1, how to transform a tree to DNF form, 2, how transforming to DNF helps comparison of two trees --- .../dataflow/regexmatch/GramBooleanQuery.java | 24 +++++++++++-------- .../RegexToGramQueryTranslatorTest.java | 13 +++++++++- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git 
a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java index 9c3f7937e48..be2658c455d 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java @@ -352,13 +352,12 @@ private static GramBooleanQuery andDNF(GramBooleanQuery left, GramBooleanQuery r /** - * simplify a tree, which is assumed to be already in DNF form + * Simplify a tree, which is assumed to be already in DNF form * Apply Absorption laws: a OR (a AND b) = a * - * Simplifying is important because it enables comparison between two trees. - * Two equivalent trees could have different forms. However, after transforming to DNF - * and applying simplifications, their forms should be identical. - * + * Simplification is extremely important, because it removes lots of redundant information, + * thus enabling comparison of two trees, + * * @param DNFQuery * @return simplifiedDNFQuery */ @@ -399,11 +398,16 @@ public static GramBooleanQuery simplifyDNF(GramBooleanQuery query) { * The query tree generated by the translator is messy with possibly lots of redundant information. * This function transforms it into Disjunctive normal form (DNF), which is an OR of different ANDs. * - * For example, the original query of regex "abc?pqr?" is: - * (((bpq AND abp) OR (abc AND cpq AND bcp)) AND (bpq OR cpq) AND (bpq OR cpq OR (pqr AND cpq) OR (pqr AND bpq)) AND (pqr OR bpq OR cpq) AND (abc OR abp)) - * which is long and messey. 
After calling toDNF() and simplifyDNF(), the result query is: - * ((abp AND bpq) OR (abc AND bcp AND cpq)) - * + * To transform a tree to DNF form, the following laws are applied recursively from bottom to top: + * Associative laws: (a OR b) OR c = a OR (b OR c) = a OR b OR c, when transforming OR nodes, + * Distributive laws: a AND (b OR c) = (a AND b) OR (a AND c), when transforming AND nodes, + * + * For each node, it's children will be transformed to DNF form first, then + * if it's OR, apply associative laws, if it's AND, apply distributive laws. + * Then recursively apply the same rules all the way up to the top node. + * + * The result is NOT simplified. Must call simplifyDNF() to obtain the optimal tree. + * * @param query * @return DNFQuery */ diff --git a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java index 3dc3667e674..a26551ff314 100644 --- a/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java +++ b/textdb/textdb-dataflow/src/test/java/edu/uci/ics/textdb/dataflow/regexmatch/RegexToGramQueryTranslatorTest.java @@ -12,6 +12,17 @@ public class RegexToGramQueryTranslatorTest { + /* + * We need to check equivalence of two trees, but two equivalent trees could have many different forms. + * The equals function in GramBooleanQuery only compares two trees shallowly, + * it returns true if two trees' form (and content) are identical. + * + * So we transform the tree to DNF form, and apply simplifications to remove redundant nodes. + * After transformation and simplification, two equivalent trees should have identical form. + * Then we can use the equals() function to check equivalence. + * + */ + // Helper function to print query tree for debugging purposes. 
private void printTranslatorResult(String regex) { GramBooleanQuery exactQuery = RegexToGramQueryTranslator.translate(regex); @@ -78,7 +89,7 @@ public void testLiteral1() { Assert.assertEquals(expectedQuery, simplifiedDNF); } - // "ab" can't form a trigram, so the result is an empty OR node. + // "ab" can't form a gram(default length 3), so the result is an empty OR node. @Test public void testLiteral2() { String regex = "ab"; From bcfa0c0a4d5cf602ffecda46c3816d611417061e Mon Sep 17 00:00:00 2001 From: zuozhiw Date: Tue, 14 Jun 2016 20:05:37 -0700 Subject: [PATCH 6/6] minor comment change --- .../uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java index be2658c455d..f646cab3b2a 100644 --- a/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java +++ b/textdb/textdb-dataflow/src/main/java/edu/uci/ics/textdb/dataflow/regexmatch/GramBooleanQuery.java @@ -402,7 +402,7 @@ public static GramBooleanQuery simplifyDNF(GramBooleanQuery query) { * Associative laws: (a OR b) OR c = a OR (b OR c) = a OR b OR c, when transforming OR nodes, * Distributive laws: a AND (b OR c) = (a AND b) OR (a AND c), when transforming AND nodes, * - * For each node, it's children will be transformed to DNF form first, then + * For each node, its children will be transformed to DNF form first, then * if it's OR, apply associative laws, if it's AND, apply distributive laws. * Then recursively apply the same rules all the way up to the top node. *