Merge pull request #4 from JohnSnowLabs/regex_matcher_fix

RegexMatcher modifications to accept a list of rules as Param
JohnSnowLabs · Oct 3, 2017 · 13dcdcc · 13dcdcc
2 parents a331495 + ee1bab8
commit 13dcdcc
Show file tree

Hide file tree

Showing 9 changed files with 78 additions and 42 deletions.
diff --git a/python/sparknlp/annotator.py b/python/sparknlp/annotator.py
@@ -102,8 +102,8 @@ class RegexMatcher(AnnotatorTransformer):
                      "strategy",
                      "MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE",
                      typeConverter=TypeConverters.toString)
-    rules = Param(Params._dummy(),
-                  "rules",
+    rulesPath = Param(Params._dummy(),
+                  "rulesPath",
                   "rules file path, must be a tuple of regex and identifier. replace config with this",
                   typeConverter=TypeConverters.toString)
 
@@ -115,8 +115,8 @@ def __init__(self):
     def setStrategy(self, value):
         return self._set(strategy=value)
 
-    def setRules(self, value):
-        return self._set(rules=value)
+    def setRulesPath(self, value):
+        return self._set(rulesPath=value)
 
 
 class Lemmatizer(AnnotatorTransformer):

diff --git a/python/test/annotators.py b/python/test/annotators.py
@@ -42,6 +42,7 @@ def runTest(self):
             .setOutputCol("document")
         regex_matcher = RegexMatcher() \
             .setStrategy("MATCH_ALL") \
+            .setRulesPath("../src/test/resources/regex-matcher/rules.txt") \
             .setOutputCol("regex")
         assembled = document_assembler.transform(self.data)
         regex_matcher.transform(assembled).show()

diff --git a/src/main/scala/com/jsl/nlp/Finisher.scala b/src/main/scala/com/jsl/nlp/Finisher.scala
@@ -35,7 +35,7 @@ class Finisher(override val uid: String)
   def getOutputCols: Array[String] = get(outputCols).getOrElse(getInputCols.map("finished_" + _))
   def getInputCols: Array[String] = $(inputCols)
   def getValueSplitSymbol: String = $(valueSplitSymbol)
-  def getAnnotationSpltSymbol: String = $(annotationSplitSymbol)
+  def getAnnotationSplitSymbol: String = $(annotationSplitSymbol)
   def getCleanAnnotations: Boolean = $(cleanAnnotations)
   def getIncludeKeys: Boolean = $(includeKeys)
 

diff --git a/src/main/scala/com/jsl/nlp/annotators/RegexMatcher.scala b/src/main/scala/com/jsl/nlp/annotators/RegexMatcher.scala
@@ -18,16 +18,26 @@ import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
   *   -- MATCH_COMPLETE returns only if match is entire target.
   */
 class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher] {
-
   import com.jsl.nlp.AnnotatorType._
 
-  // ToDo: Check wether this annotator can be stored to disk as is. otherwise turn regex into string
-  val rules: Param[String] = new Param(this, "rules", "regex patterns to match")
+  lazy val defaultRules: Array[(String, String)] = ResourceHelper.retrieveRegexMatchRules()
+
+  // ToDo: Check whether this annotator can be stored to disk as is. otherwise turn regex into string
 
-  private var loadedRules: Array[(String, String)] = ResourceHelper.retrieveRegexMatchRules()
+  val rulesPath: Param[String] = new Param(this, "rulesPath", "File containing rules separated by commas")
+
+  val rules: Param[Array[(String, String)]] = new Param(this, "rules", "Array of rule strings separated by commas")
 
   val strategy: Param[String] = new Param(this, "strategy", "MATCH_ALL|MATCH_FIRST|MATCH_COMPLETE")
 
+  def setRulesPath(path: String): this.type = set(rulesPath, path)
+
+  def getRulesPath: String = $(rulesPath)
+
+  def setRules(value: Array[(String, String)]): this.type = set(rules, value)
+
+  def getRules: Array[(String, String)] = $(rules)
+
   private val matchFactory = RuleFactory.lateMatching(TransformStrategy.NO_TRANSFORM)(_)
 
   override val annotatorType: AnnotatorType = REGEX
@@ -36,19 +46,17 @@ class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher
 
   setDefault(inputCols, Array(DOCUMENT))
 
-  def this() = this(Identifiable.randomUID("REGEX_MATCHER"))
-
-  def getRules: Array[(String, String)] = loadedRules
+  setDefault(rulesPath, "__default")
 
-  def setRules(path: String): this.type = {
-    loadedRules = ResourceHelper.retrieveRegexMatchRules(path)
-    set(rules, path)
-  }
+  def this() = this(Identifiable.randomUID("REGEX_MATCHER"))
 
   def setStrategy(value: String): this.type = set(strategy, value)
 
   def getStrategy: String = $(strategy).toString
 
+  private def resolveRulesFromPath(): Array[(String, String)] =
+    ResourceHelper.retrieveRegexMatchRules($(rulesPath))
+
   private def getFactoryStrategy: MatchStrategy = $(strategy) match {
     case "MATCH_ALL" => MatchStrategy.MATCH_ALL
     case "MATCH_FIRST" => MatchStrategy.MATCH_FIRST
@@ -60,7 +68,7 @@ class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher
   override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
     annotations.flatMap { annotation =>
       matchFactory(getFactoryStrategy)
-        .setRules(loadedRules.map(r => new RegexRule(r._1, r._2)))
+        .setRules(get(rules).getOrElse(resolveRulesFromPath()).map(r => new RegexRule(r._1, r._2)))
         .findMatch(annotation.metadata(AnnotatorType.DOCUMENT)).map { m =>
           Annotation(
             annotatorType,
@@ -72,4 +80,5 @@ class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher
     }
   }
 }
+
 object RegexMatcher extends DefaultParamsReadable[RegexMatcher]
diff --git a/src/test/resources/application.conf b/src/test/resources/application.conf
@@ -4,7 +4,7 @@ nlp {
     format = "txt"
   }
   regexMatcher {
-    file = ""
+    file = "/regex-matcher/rules.txt"
     format = "txt"
     separator = ","
   }

diff --git a/src/test/resources/regex-matcher/rules.txt b/src/test/resources/regex-matcher/rules.txt
@@ -0,0 +1,2 @@
+the\s\w+, followed by 'the'
+ceremonies, "ceremony"
diff --git a/src/test/scala/com/jsl/nlp/AnnotatorBuilder.scala b/src/test/scala/com/jsl/nlp/AnnotatorBuilder.scala
@@ -77,11 +77,12 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
       .transform(withFullPragmaticSentenceDetector(withTokenizer(dataset)))
   }
 
-  def withRegexMatcher(dataset: Dataset[Row], rules: Array[(String, String)], strategy: String): Dataset[Row] = {
+  def withRegexMatcher(dataset: Dataset[Row], rules: Array[(String, String)] = Array.empty[(String, String)], strategy: String): Dataset[Row] = {
     val regexMatcher = new RegexMatcher()
       .setStrategy(strategy)
       .setInputCols(Array("document"))
       .setOutputCol("regex")
+    if (rules.nonEmpty) regexMatcher.setRules(rules)
     regexMatcher.transform(dataset)
   }
 

diff --git a/src/test/scala/com/jsl/nlp/annotators/RegexMatcherBehaviors.scala b/src/test/scala/com/jsl/nlp/annotators/RegexMatcherBehaviors.scala
@@ -8,33 +8,55 @@ import scala.language.reflectiveCalls
 
 trait RegexMatcherBehaviors { this: FlatSpec =>
   def fixture(dataset: Dataset[Row], rules: Array[(String, String)], strategy: String) = new {
-    val df = AnnotatorBuilder.withRegexMatcher(dataset, rules, strategy)
-    val regexAnnotations = df.select("regex")
+    val annotationDataset: Dataset[_] = AnnotatorBuilder.withRegexMatcher(dataset, rules, strategy)
+    val regexAnnotations: Array[Annotation] = annotationDataset.select("regex")
       .collect
       .flatMap { _.getSeq[Row](0) }
       .map { Annotation(_) }
 
-    df.show
+    annotationDataset.show()
   }
 
-  def predefinedRulesRegexMatcher(dataset: => Dataset[Row], rules: Array[(String, String)], strategy: String): Unit = {
-//    "A RegexMatcher Annotator" should s"successfuly match ${rules.map(_._1).mkString(",")}" in {
-//      val f = fixture(dataset, rules, strategy)
-//      f.regexAnnotations.foreach { a =>
-//        assert(a.metadata == REGEX)
-//      }
-//    }
-//
-//    it should "create annotations" in {
-//      val f = fixture(dataset, rules, strategy)
-//      assert(f.regexAnnotations.size > 0)
-//    }
-//
-//    it should "create annotations with the correct tag" in {
-//      val f = fixture(dataset, rules, strategy)
-//      f.regexAnnotations.foreach { a =>
-//        assert(a.annotatorType == REGEX)
-//      }
-//    }
+  def predefinedRulesRegexMatcher(dataset: => Dataset[Row], strategy: String): Unit = {
+    val rules = Array.empty[(String, String)]
+    "A RegexMatcher Annotator with predefined rules" should s"successfuly match" in {
+      val f = fixture(dataset, rules, strategy)
+      f.regexAnnotations.foreach { a =>
+        assert(a.metadata.toArray.nonEmpty)
+      }
+    }
+
+    it should "create annotations" in {
+      val f = fixture(dataset, rules, strategy)
+      assert(f.regexAnnotations.nonEmpty)
+    }
+
+    it should "create annotations with the correct tag" in {
+      val f = fixture(dataset, rules, strategy)
+      f.regexAnnotations.foreach { a =>
+        assert(a.annotatorType == REGEX)
+      }
+    }
+  }
+
+  def customizedRulesRegexMatcher(dataset: => Dataset[Row], rules: Array[(String, String)], strategy: String): Unit = {
+    "A RegexMatcher Annotator with custom rules" should s"successfuly match ${rules.map(_._1).mkString(",")}" in {
+      val f = fixture(dataset, rules, strategy)
+      f.regexAnnotations.foreach { a =>
+        assert(a.metadata.toArray.nonEmpty)
+      }
+    }
+
+    it should "create annotations" in {
+      val f = fixture(dataset, rules, strategy)
+      assert(f.regexAnnotations.nonEmpty)
+    }
+
+    it should "create annotations with the correct tag" in {
+      val f = fixture(dataset, rules, strategy)
+      f.regexAnnotations.foreach { a =>
+        assert(a.annotatorType == REGEX)
+      }
+    }
   }
 }
diff --git a/src/test/scala/com/jsl/nlp/annotators/RegexMatcherTestSpec.scala b/src/test/scala/com/jsl/nlp/annotators/RegexMatcherTestSpec.scala
@@ -11,5 +11,6 @@ class RegexMatcherTestSpec extends FlatSpec with RegexMatcherBehaviors {
     ("the\\s\\w+", "followed by 'the'"),
     ("ceremonies", "ceremony")
   )
-  "A full RegexMatcher pipeline with content" should behave like predefinedRulesRegexMatcher(df, rules, strategy)
+  "A full RegexMatcher pipeline with content" should behave like customizedRulesRegexMatcher(df, rules, strategy)
+  it should behave like predefinedRulesRegexMatcher(df, strategy)
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		the\s\w+, followed by 'the'
		ceremonies, "ceremony"