Skip to content

Commit

Permalink
[SPARKNLP-1058] Adding aggressiveMatching parameter (#14365)
Browse files Browse the repository at this point in the history
  • Loading branch information
danilojsl authored Aug 28, 2024
1 parent 9227f6d commit b785fa3
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 10 deletions.
15 changes: 15 additions & 0 deletions python/sparknlp/annotator/matcher/date_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ class DateMatcherUtils(Params):
"Matched Strategy to searches relaxed dates",
typeConverter=TypeConverters.toString)

# Param declaration for the "aggressiveMatching" flag. Its value is assigned through
# setAggressiveMatching() and converted with TypeConverters.toBoolean.
aggressiveMatching = Param(Params._dummy(),
"aggressiveMatching",
"Whether to aggressively attempt to find date matches, even in ambiguous or less common formats",
typeConverter=TypeConverters.toBoolean)

def setInputFormats(self, value):
"""Sets input formats patterns to match in the documents.
Expand Down Expand Up @@ -177,6 +182,16 @@ def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST):
"""
return self._set(relaxedFactoryStrategy=matchStrategy)

def setAggressiveMatching(self, value):
    """Sets whether to aggressively attempt to find date matches, even in
    ambiguous or less common formats.

    Parameters
    ----------
    value : bool
        Whether to aggressively attempt to find date matches, even in
        ambiguous or less common formats

    Returns
    -------
    object
        Whatever ``self._set`` returns for the updated ``aggressiveMatching``
        param.
    """
    return self._set(aggressiveMatching=value)


class DateMatcher(AnnotatorModel, DateMatcherUtils):
"""Matches standard date formats into a provided format
Expand Down
15 changes: 9 additions & 6 deletions src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,15 @@ class DateMatcher(override val uid: String)
def inputFormatsAreDefined = !getInputFormats.sameElements(EMPTY_INIT_ARRAY)

val possibleDate: Option[MatchedDateTime] =
if (inputFormatsAreDefined)
runInputFormatsSearch(_text)
else
runDateExtractorChain(_text)

possibleDate.orElse(setTimeIfAny(possibleDate, _text))
if (inputFormatsAreDefined) runInputFormatsSearch(_text) else runDateExtractorChain(_text)

if (getAggressiveMatching) {
possibleDate
.orElse(runDateExtractorChain(_text))
.orElse(setTimeIfAny(possibleDate, _text))
} else {
possibleDate.orElse(setTimeIfAny(possibleDate, _text))
}
}

private def runDateExtractorChain(_text: String) = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,28 @@ trait DateMatcherUtils extends Params {
*/
def getRelaxedFactoryStrategy: String = $(relaxedFactoryStrategy)

/** Whether to aggressively attempt to find date matches, even in ambiguous or less common
* formats (Default: `false`).
*
* The matchers read this flag through `getAggressiveMatching`: when it is `true` and the
* initial search produced no match, they additionally run the date extractor chain on the
* text.
*
* @group param
*/
val aggressiveMatching: BooleanParam = new BooleanParam(
this,
"aggressiveMatching",
"Whether to aggressively attempt to find date matches, even in ambiguous or less common formats")

/** Turns aggressive date matching on or off.
 *
 * @param value
 *   `true` to enable aggressive matching, `false` to disable it
 * @return
 *   this instance, so that setter calls can be chained
 * @group param
 */
def setAggressiveMatching(value: Boolean): this.type = {
  set(aggressiveMatching, value)
}

/** Reports whether aggressive date matching is enabled.
 *
 * @return
 *   the value held by the `aggressiveMatching` param, or its default when not explicitly set
 * @group param
 */
def getAggressiveMatching: Boolean = {
  getOrDefault(aggressiveMatching)
}

setDefault(
inputFormats -> Array(""),
outputFormat -> "yyyy/MM/dd",
Expand All @@ -284,7 +306,8 @@ trait DateMatcherUtils extends Params {
readMonthFirst -> true,
defaultDayWhenMissing -> 1,
sourceLanguage -> "en",
relaxedFactoryStrategy -> MatchStrategy.MATCH_FIRST.toString)
relaxedFactoryStrategy -> MatchStrategy.MATCH_FIRST.toString,
aggressiveMatching -> false)

protected val formalFactoryInputFormats = new RuleFactory(MatchStrategy.MATCH_ALL)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,9 @@ class MultiDateMatcher(override val uid: String)
else
runDateExtractorChain(_text)

possibleDates
if (getAggressiveMatching && possibleDates.isEmpty) {
runDateExtractorChain(_text)
} else possibleDates
}

private def extractRelativeDateFuture(text: String): Seq[MatchedDateTime] = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@ package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.AnnotatorType.DATE
import com.johnsnowlabs.nlp.{Annotation, AnnotatorType, DataBuilder}
import com.johnsnowlabs.tags.FastTest

import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.{Dataset, Row}

import org.scalatest.flatspec.AnyFlatSpec

import java.util.Calendar
Expand Down Expand Up @@ -347,4 +346,34 @@ class DateMatcherTestSpec extends AnyFlatSpec with DateMatcherBehaviors {

assert(results == expectedDates)
}

// With aggressive matching enabled and only the "dd/MM/yyyy" input format configured,
// every sample document is expected to yield at least one date annotation.
"a DateMatcher" should "make a more forceful or proactive approach in finding dates when aggressive match is set" in {

  val data = DataBuilder.basicDataBuild(
    "See you on next monday.",
    "I was born at 01/03/98",
    "She was born on 02/03/1966.",
    "The project started yesterday and will finish next year.",
    "She will graduate by July 2023.",
    "She will visit doctor tomorrow and next month again.")

  val dateMatcher = new DateMatcher()
    .setInputCols(Array("document"))
    .setReadMonthFirst(false)
    .setOutputCol("date")
    .setInputFormats(Array("dd/MM/yyyy"))
    .setOutputFormat("dd/MM/yyyy")
    .setAggressiveMatching(true)

  val pipeline = new Pipeline().setStages(Array(dateMatcher))

  val annotated = pipeline.fit(data).transform(data)
  val collectResult = annotated.select("date").collect()

  // Attach the row position to the failure so the offending sample is easy to locate.
  collectResult.zipWithIndex.foreach { case (result, rowIndex) =>
    val annotations = Annotation.getAnnotations(result, "date")
    assert(annotations.nonEmpty, s"no date annotation found for collected row $rowIndex")
  }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -479,4 +479,33 @@ class MultiDateMatcherMultiLanguageTestSpec extends AnyFlatSpec with DateMatcher
assert(results.contains(getOneDayAgoDate()) && results.contains(getInTwoWeeksDate()))
}

// With aggressive matching enabled and only the "dd/MM/yyyy" input format configured,
// every sample document is expected to yield at least one multi-date annotation.
"a MultiDateMatcher" should "make a more forceful or proactive approach in finding dates when aggressive match is set" in {

  val data = DataBuilder.basicDataBuild(
    "See you on next monday.",
    "I was born at 01/03/98",
    "She was born on 02/03/1966.",
    "The project started yesterday and will finish next year.",
    "She will graduate by July 2023.",
    "She will visit doctor tomorrow and next month again.")

  val multiDate = new MultiDateMatcher()
    .setInputCols(Array("document"))
    .setReadMonthFirst(false)
    .setOutputCol("multi_date")
    .setInputFormats(Array("dd/MM/yyyy"))
    .setOutputFormat("dd/MM/yyyy")
    .setAggressiveMatching(true)

  val pipeline = new Pipeline().setStages(Array(multiDate))

  val annotated = pipeline.fit(data).transform(data)
  val collectResult = annotated.select("multi_date").collect()

  // Attach the row position to the failure so the offending sample is easy to locate.
  collectResult.zipWithIndex.foreach { case (result, rowIndex) =>
    val annotations = Annotation.getAnnotations(result, "multi_date")
    assert(annotations.nonEmpty, s"no date annotation found for collected row $rowIndex")
  }
}

}

0 comments on commit b785fa3

Please sign in to comment.