Skip to content

Commit

Permalink
Merge pull request #4 from JohnSnowLabs/regex_matcher_fix
Browse files Browse the repository at this point in the history
RegexMatcher modifications to accept a list of rules as a Param
  • Loading branch information
saif-ellafi authored Oct 3, 2017
2 parents a331495 + ee1bab8 commit 13dcdcc
Show file tree
Hide file tree
Showing 9 changed files with 78 additions and 42 deletions.
8 changes: 4 additions & 4 deletions python/sparknlp/annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ class RegexMatcher(AnnotatorTransformer):
"strategy",
"MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE",
typeConverter=TypeConverters.toString)
rules = Param(Params._dummy(),
"rules",
rulesPath = Param(Params._dummy(),
"rulesPath",
"rules file path, must be a tuple of regex and identifier. replace config with this",
typeConverter=TypeConverters.toString)

Expand All @@ -115,8 +115,8 @@ def __init__(self):
def setStrategy(self, value):
return self._set(strategy=value)

def setRules(self, value):
return self._set(rules=value)
def setRulesPath(self, value):
return self._set(rulesPath=value)


class Lemmatizer(AnnotatorTransformer):
Expand Down
1 change: 1 addition & 0 deletions python/test/annotators.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def runTest(self):
.setOutputCol("document")
regex_matcher = RegexMatcher() \
.setStrategy("MATCH_ALL") \
.setRulesPath("../src/test/resources/regex-matcher/rules.txt") \
.setOutputCol("regex")
assembled = document_assembler.transform(self.data)
regex_matcher.transform(assembled).show()
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/com/jsl/nlp/Finisher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class Finisher(override val uid: String)
def getOutputCols: Array[String] = get(outputCols).getOrElse(getInputCols.map("finished_" + _))
def getInputCols: Array[String] = $(inputCols)
def getValueSplitSymbol: String = $(valueSplitSymbol)
def getAnnotationSpltSymbol: String = $(annotationSplitSymbol)
def getAnnotationSplitSymbol: String = $(annotationSplitSymbol)
def getCleanAnnotations: Boolean = $(cleanAnnotations)
def getIncludeKeys: Boolean = $(includeKeys)

Expand Down
33 changes: 21 additions & 12 deletions src/main/scala/com/jsl/nlp/annotators/RegexMatcher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,26 @@ import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
* -- MATCH_COMPLETE returns only if match is entire target.
*/
class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher] {

import com.jsl.nlp.AnnotatorType._

// ToDo: Check wether this annotator can be stored to disk as is. otherwise turn regex into string
val rules: Param[String] = new Param(this, "rules", "regex patterns to match")
lazy val defaultRules: Array[(String, String)] = ResourceHelper.retrieveRegexMatchRules()

// ToDo: Check whether this annotator can be stored to disk as is. otherwise turn regex into string

private var loadedRules: Array[(String, String)] = ResourceHelper.retrieveRegexMatchRules()
val rulesPath: Param[String] = new Param(this, "rulesPath", "File containing rules separated by commas")

val rules: Param[Array[(String, String)]] = new Param(this, "rules", "Array of rule strings separated by commas")

val strategy: Param[String] = new Param(this, "strategy", "MATCH_ALL|MATCH_FIRST|MATCH_COMPLETE")

def setRulesPath(path: String): this.type = set(rulesPath, path)

def getRulesPath: String = $(rulesPath)

def setRules(value: Array[(String, String)]): this.type = set(rules, value)

def getRules: Array[(String, String)] = $(rules)

private val matchFactory = RuleFactory.lateMatching(TransformStrategy.NO_TRANSFORM)(_)

override val annotatorType: AnnotatorType = REGEX
Expand All @@ -36,19 +46,17 @@ class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher

setDefault(inputCols, Array(DOCUMENT))

def this() = this(Identifiable.randomUID("REGEX_MATCHER"))

def getRules: Array[(String, String)] = loadedRules
setDefault(rulesPath, "__default")

def setRules(path: String): this.type = {
loadedRules = ResourceHelper.retrieveRegexMatchRules(path)
set(rules, path)
}
def this() = this(Identifiable.randomUID("REGEX_MATCHER"))

def setStrategy(value: String): this.type = set(strategy, value)

def getStrategy: String = $(strategy).toString

private def resolveRulesFromPath(): Array[(String, String)] =
ResourceHelper.retrieveRegexMatchRules($(rulesPath))

private def getFactoryStrategy: MatchStrategy = $(strategy) match {
case "MATCH_ALL" => MatchStrategy.MATCH_ALL
case "MATCH_FIRST" => MatchStrategy.MATCH_FIRST
Expand All @@ -60,7 +68,7 @@ class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher
override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
annotations.flatMap { annotation =>
matchFactory(getFactoryStrategy)
.setRules(loadedRules.map(r => new RegexRule(r._1, r._2)))
.setRules(get(rules).getOrElse(resolveRulesFromPath()).map(r => new RegexRule(r._1, r._2)))
.findMatch(annotation.metadata(AnnotatorType.DOCUMENT)).map { m =>
Annotation(
annotatorType,
Expand All @@ -72,4 +80,5 @@ class RegexMatcher(override val uid: String) extends AnnotatorModel[RegexMatcher
}
}
}

object RegexMatcher extends DefaultParamsReadable[RegexMatcher]
2 changes: 1 addition & 1 deletion src/test/resources/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ nlp {
format = "txt"
}
regexMatcher {
file = ""
file = "/regex-matcher/rules.txt"
format = "txt"
separator = ","
}
Expand Down
2 changes: 2 additions & 0 deletions src/test/resources/regex-matcher/rules.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
the\s\w+, followed by 'the'
ceremonies, "ceremony"
3 changes: 2 additions & 1 deletion src/test/scala/com/jsl/nlp/AnnotatorBuilder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,12 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
.transform(withFullPragmaticSentenceDetector(withTokenizer(dataset)))
}

def withRegexMatcher(dataset: Dataset[Row], rules: Array[(String, String)], strategy: String): Dataset[Row] = {
/**
  * Builds a [[RegexMatcher]] that reads the "document" column and writes the "regex" column,
  * then applies it to `dataset`.
  *
  * @param dataset  rows to transform
  * @param rules    (pattern, identifier) pairs forwarded to `setRules`; when the array is
  *                 empty `setRules` is not called and the matcher keeps its own rule source
  * @param strategy value forwarded to `setStrategy`
  * @return the dataset produced by the matcher's `transform`
  */
def withRegexMatcher(
  dataset: Dataset[Row],
  rules: Array[(String, String)] = Array.empty[(String, String)],
  strategy: String
): Dataset[Row] = {
  val matcher =
    new RegexMatcher().setStrategy(strategy).setInputCols(Array("document")).setOutputCol("regex")

  // Explicit rules are applied only when the caller actually supplied some.
  if (!rules.isEmpty) {
    matcher.setRules(rules)
  }

  matcher.transform(dataset)
}

Expand Down
66 changes: 44 additions & 22 deletions src/test/scala/com/jsl/nlp/annotators/RegexMatcherBehaviors.scala
Original file line number Diff line number Diff line change
Expand Up @@ -8,33 +8,55 @@ import scala.language.reflectiveCalls

trait RegexMatcherBehaviors { this: FlatSpec =>
def fixture(dataset: Dataset[Row], rules: Array[(String, String)], strategy: String) = new {
val df = AnnotatorBuilder.withRegexMatcher(dataset, rules, strategy)
val regexAnnotations = df.select("regex")
val annotationDataset: Dataset[_] = AnnotatorBuilder.withRegexMatcher(dataset, rules, strategy)
val regexAnnotations: Array[Annotation] = annotationDataset.select("regex")
.collect
.flatMap { _.getSeq[Row](0) }
.map { Annotation(_) }

df.show
annotationDataset.show()
}

def predefinedRulesRegexMatcher(dataset: => Dataset[Row], rules: Array[(String, String)], strategy: String): Unit = {
// "A RegexMatcher Annotator" should s"successfuly match ${rules.map(_._1).mkString(",")}" in {
// val f = fixture(dataset, rules, strategy)
// f.regexAnnotations.foreach { a =>
// assert(a.metadata == REGEX)
// }
// }
//
// it should "create annotations" in {
// val f = fixture(dataset, rules, strategy)
// assert(f.regexAnnotations.size > 0)
// }
//
// it should "create annotations with the correct tag" in {
// val f = fixture(dataset, rules, strategy)
// f.regexAnnotations.foreach { a =>
// assert(a.annotatorType == REGEX)
// }
// }
/**
  * Shared behaviors for a RegexMatcher that is handed an empty rule array, so the
  * annotator is exercised without explicitly supplied rules.
  *
  * @param dataset  by-name dataset, re-evaluated each time a test builds its fixture
  * @param strategy matching strategy forwarded to the fixture
  */
def predefinedRulesRegexMatcher(dataset: => Dataset[Row], strategy: String): Unit = {
  val noRules = Array.empty[(String, String)]

  // Builds a fresh fixture on every call (i.e. once per test) and exposes its annotations.
  def annotations = fixture(dataset, noRules, strategy).regexAnnotations

  "A RegexMatcher Annotator with predefined rules" should "successfuly match" in {
    for (annotation <- annotations) {
      assert(annotation.metadata.toArray.nonEmpty)
    }
  }

  it should "create annotations" in {
    assert(annotations.nonEmpty)
  }

  it should "create annotations with the correct tag" in {
    for (annotation <- annotations) {
      assert(annotation.annotatorType == REGEX)
    }
  }
}

/**
  * Shared behaviors for a RegexMatcher configured with caller-supplied rules.
  *
  * @param dataset  by-name dataset, re-evaluated each time a test builds its fixture
  * @param rules    (pattern, identifier) pairs; the patterns also appear in the first test's name
  * @param strategy matching strategy forwarded to the fixture
  */
def customizedRulesRegexMatcher(dataset: => Dataset[Row], rules: Array[(String, String)], strategy: String): Unit = {
  // Comma-joined patterns, computed once when the behaviors are registered.
  val patterns = rules.map(_._1).mkString(",")

  // Builds a fresh fixture on every call (i.e. once per test) and exposes its annotations.
  def annotations = fixture(dataset, rules, strategy).regexAnnotations

  "A RegexMatcher Annotator with custom rules" should s"successfuly match $patterns" in {
    for (annotation <- annotations) {
      assert(annotation.metadata.toArray.nonEmpty)
    }
  }

  it should "create annotations" in {
    assert(annotations.nonEmpty)
  }

  it should "create annotations with the correct tag" in {
    for (annotation <- annotations) {
      assert(annotation.annotatorType == REGEX)
    }
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ class RegexMatcherTestSpec extends FlatSpec with RegexMatcherBehaviors {
("the\\s\\w+", "followed by 'the'"),
("ceremonies", "ceremony")
)
"A full RegexMatcher pipeline with content" should behave like predefinedRulesRegexMatcher(df, rules, strategy)
"A full RegexMatcher pipeline with content" should behave like customizedRulesRegexMatcher(df, rules, strategy)
it should behave like predefinedRulesRegexMatcher(df, strategy)
}

0 comments on commit 13dcdcc

Please sign in to comment.