From aba383e2945414342c6b8c41bf84f9dc32ef1e1f Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Sun, 11 Aug 2024 08:37:04 -0500 Subject: [PATCH] [SPARKNLP-1058] Adding aggressiveMatching parameter --- .../annotator/matcher/date_matcher.py | 15 +++++++++ .../nlp/annotators/DateMatcher.scala | 15 +++++---- .../nlp/annotators/DateMatcherUtils.scala | 25 +++++++++++++- .../nlp/annotators/MultiDateMatcher.scala | 4 ++- .../nlp/annotators/DateMatcherTestSpec.scala | 33 +++++++++++++++++-- ...ultiDateMatcherMultiLanguageTestSpec.scala | 29 ++++++++++++++++ 6 files changed, 111 insertions(+), 10 deletions(-) diff --git a/python/sparknlp/annotator/matcher/date_matcher.py b/python/sparknlp/annotator/matcher/date_matcher.py index 8bf9583eef6884..3ff3b81c97f21a 100755 --- a/python/sparknlp/annotator/matcher/date_matcher.py +++ b/python/sparknlp/annotator/matcher/date_matcher.py @@ -72,6 +72,11 @@ class DateMatcherUtils(Params): "Matched Strategy to searches relaxed dates", typeConverter=TypeConverters.toString) + aggressiveMatching = Param(Params._dummy(), + "aggressiveMatching", + "Whether to aggressively attempt to find date matches, even in ambiguous or less common formats", + typeConverter=TypeConverters.toBoolean) + def setInputFormats(self, value): """Sets input formats patterns to match in the documents. @@ -177,6 +182,16 @@ def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST): """ return self._set(relaxedFactoryStrategy=matchStrategy) + def setAggressiveMatching(self, value): + """ Sets whether to aggressively attempt to find date matches, even in ambiguous or less common formats + + Parameters + ---------- + aggressiveMatching : Boolean + Whether to aggressively attempt to find date matches, even in ambiguous or less common formats + """ + return self._set(aggressiveMatching=value) + class DateMatcher(AnnotatorModel, DateMatcherUtils): """Matches standard date formats into a provided format diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcher.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcher.scala index d0ae20ad26aa9f..1509105ee87b1b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcher.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcher.scala @@ -164,12 +164,15 @@ class DateMatcher(override val uid: String) def inputFormatsAreDefined = !getInputFormats.sameElements(EMPTY_INIT_ARRAY) val possibleDate: Option[MatchedDateTime] = - if (inputFormatsAreDefined) - runInputFormatsSearch(_text) - else - runDateExtractorChain(_text) - - possibleDate.orElse(setTimeIfAny(possibleDate, _text)) + if (inputFormatsAreDefined) runInputFormatsSearch(_text) else runDateExtractorChain(_text) + + if (getAggressiveMatching) { + possibleDate + .orElse(runDateExtractorChain(_text)) + .orElse(setTimeIfAny(possibleDate, _text)) + } else { + possibleDate.orElse(setTimeIfAny(possibleDate, _text)) + } } private def runDateExtractorChain(_text: String) = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala index d44a921dab4076..87831f341f9dd8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala @@ -275,6 +275,28 @@ trait DateMatcherUtils extends Params { */ def getRelaxedFactoryStrategy: String = $(relaxedFactoryStrategy) + /** Whether to aggressively attempt to find date matches, even in ambiguous or less common + * formats (Default: `false`) + * + * @group param + */ + val aggressiveMatching: BooleanParam = new BooleanParam( + this, + "aggressiveMatching", + "Whether to aggressively attempt to find date matches, even in ambiguous or less common formats") + + /** To set aggressive matching Strategy + * + * @group param + */ + def setAggressiveMatching(value: Boolean): this.type = set(aggressiveMatching, value) + + /** To get aggressive matching Strategy + * + * @group param + */ + def getAggressiveMatching: Boolean = $(aggressiveMatching) + setDefault( inputFormats -> Array(""), outputFormat -> "yyyy/MM/dd", @@ -284,7 +306,8 @@ trait DateMatcherUtils extends Params { readMonthFirst -> true, defaultDayWhenMissing -> 1, sourceLanguage -> "en", - relaxedFactoryStrategy -> MatchStrategy.MATCH_FIRST.toString) + relaxedFactoryStrategy -> MatchStrategy.MATCH_FIRST.toString, + aggressiveMatching -> false) protected val formalFactoryInputFormats = new RuleFactory(MatchStrategy.MATCH_ALL) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala index b711a532c63619..69018bccd1ddd2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala @@ -194,7 +194,9 @@ class MultiDateMatcher(override val uid: String) else runDateExtractorChain(_text) - possibleDates + if (getAggressiveMatching && possibleDates.isEmpty) { + runDateExtractorChain(_text) + } else possibleDates } private def extractRelativeDateFuture(text: String): Seq[MatchedDateTime] = { diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/DateMatcherTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/DateMatcherTestSpec.scala index bb5e9410b486cb..4608b932f15f9c 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/DateMatcherTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/DateMatcherTestSpec.scala @@ -19,9 +19,8 @@ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.AnnotatorType.DATE import com.johnsnowlabs.nlp.{Annotation, AnnotatorType, DataBuilder} import com.johnsnowlabs.tags.FastTest - +import org.apache.spark.ml.Pipeline import org.apache.spark.sql.{Dataset, Row} - import org.scalatest.flatspec.AnyFlatSpec import java.util.Calendar @@ -347,4 +346,34 @@ class DateMatcherTestSpec extends AnyFlatSpec with DateMatcherBehaviors { assert(results == expectedDates) } + + "a DataMatcher" should "make a more forceful or proactive approach in finding dates when aggressive match is set" in { + + val data = DataBuilder.basicDataBuild( + "See you on next monday.", + "I was born at 01/03/98", + "She was born on 02/03/1966.", + "The project started yesterday and will finish next year.", + "She will graduate by July 2023.", + "She will visit doctor tomorrow and next month again.") + + val multiDate = new DateMatcher() + .setInputCols(Array("document")) + .setReadMonthFirst(false) + .setOutputCol("date") + .setInputFormats(Array("dd/MM/yyyy")) + .setOutputFormat("dd/MM/yyyy") + .setAggressiveMatching(true) + + val pipeline = new Pipeline().setStages(Array(multiDate)) + + val annotated = pipeline.fit(data).transform(data) + val collectResult = annotated.select("date").collect() + + collectResult.foreach { result => + val annotations = Annotation.getAnnotations(result, "date") + assert(annotations.nonEmpty) + } + } + } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherMultiLanguageTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherMultiLanguageTestSpec.scala index 4e73e951a61c9f..5e379845053ddc 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherMultiLanguageTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherMultiLanguageTestSpec.scala @@ -479,4 +479,33 @@ class MultiDateMatcherMultiLanguageTestSpec extends AnyFlatSpec with DateMatcher assert(results.contains(getOneDayAgoDate()) && results.contains(getInTwoWeeksDate())) } + "a DataMatcher" should "make a more forceful or proactive approach in finding dates when aggressive match is set" in { + + val data = DataBuilder.basicDataBuild( + "See you on next monday.", + "I was born at 01/03/98", + "She was born on 02/03/1966.", + "The project started yesterday and will finish next year.", + "She will graduate by July 2023.", + "She will visit doctor tomorrow and next month again.") + + val multiDate = new MultiDateMatcher() + .setInputCols(Array("document")) + .setReadMonthFirst(false) + .setOutputCol("multi_date") + .setInputFormats(Array("dd/MM/yyyy")) + .setOutputFormat("dd/MM/yyyy") + .setAggressiveMatching(true) + + val pipeline = new Pipeline().setStages(Array(multiDate)) + + val annotated = pipeline.fit(data).transform(data) + val collectResult = annotated.select("multi_date").collect() + + collectResult.foreach { result => + val annotations = Annotation.getAnnotations(result, "multi_date") + assert(annotations.nonEmpty) + } + } + }