Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARKNLP-1058] Adding aggressiveMatching parameter #14365

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions python/sparknlp/annotator/matcher/date_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ class DateMatcherUtils(Params):
"Matched Strategy to searches relaxed dates",
typeConverter=TypeConverters.toString)

# Boolean Param controlling whether date matching is attempted aggressively;
# configured through setAggressiveMatching.
aggressiveMatching = Param(Params._dummy(),
                           "aggressiveMatching",
                           "Whether to aggressively attempt to find date matches, even in ambiguous or less common formats",
                           typeConverter=TypeConverters.toBoolean)

def setInputFormats(self, value):
"""Sets input formats patterns to match in the documents.

Expand Down Expand Up @@ -177,6 +182,16 @@ def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST):
"""
return self._set(relaxedFactoryStrategy=matchStrategy)

def setAggressiveMatching(self, value):
    """Sets whether to aggressively attempt to find date matches, even in
    ambiguous or less common formats.

    Parameters
    ----------
    value : bool
        Whether to aggressively attempt to find date matches, even in
        ambiguous or less common formats
    """
    return self._set(aggressiveMatching=value)


class DateMatcher(AnnotatorModel, DateMatcherUtils):
"""Matches standard date formats into a provided format
Expand Down
15 changes: 9 additions & 6 deletions src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcher.scala
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,15 @@ class DateMatcher(override val uid: String)
def inputFormatsAreDefined = !getInputFormats.sameElements(EMPTY_INIT_ARRAY)

val possibleDate: Option[MatchedDateTime] =
if (inputFormatsAreDefined)
runInputFormatsSearch(_text)
else
runDateExtractorChain(_text)

possibleDate.orElse(setTimeIfAny(possibleDate, _text))
if (inputFormatsAreDefined) runInputFormatsSearch(_text) else runDateExtractorChain(_text)

if (getAggressiveMatching) {
possibleDate
.orElse(runDateExtractorChain(_text))
.orElse(setTimeIfAny(possibleDate, _text))
} else {
possibleDate.orElse(setTimeIfAny(possibleDate, _text))
}
}

private def runDateExtractorChain(_text: String) = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,28 @@ trait DateMatcherUtils extends Params {
*/
def getRelaxedFactoryStrategy: String = $(relaxedFactoryStrategy)

/** Whether to aggressively attempt to find date matches, even in ambiguous or less common
  * formats (Default: `false`).
  *
  * When enabled and the regular search yields no match, the matchers additionally run the
  * date extractor chain as a fallback.
  *
  * @group param
  */
val aggressiveMatching: BooleanParam = new BooleanParam(
  this,
  "aggressiveMatching",
  "Whether to aggressively attempt to find date matches, even in ambiguous or less common formats")

/** Sets whether to aggressively attempt to find date matches, even in ambiguous or less
  * common formats.
  *
  * @param value
  *   `true` to enable aggressive matching, `false` to disable it
  * @group param
  */
def setAggressiveMatching(value: Boolean): this.type = set(aggressiveMatching, value)

/** Gets whether aggressive matching is enabled, i.e. the current value of the
  * `aggressiveMatching` param.
  *
  * @group param
  */
def getAggressiveMatching: Boolean = $(aggressiveMatching)

setDefault(
inputFormats -> Array(""),
outputFormat -> "yyyy/MM/dd",
Expand All @@ -284,7 +306,8 @@ trait DateMatcherUtils extends Params {
readMonthFirst -> true,
defaultDayWhenMissing -> 1,
sourceLanguage -> "en",
relaxedFactoryStrategy -> MatchStrategy.MATCH_FIRST.toString)
relaxedFactoryStrategy -> MatchStrategy.MATCH_FIRST.toString,
aggressiveMatching -> false)

protected val formalFactoryInputFormats = new RuleFactory(MatchStrategy.MATCH_ALL)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,9 @@ class MultiDateMatcher(override val uid: String)
else
runDateExtractorChain(_text)

possibleDates
if (getAggressiveMatching && possibleDates.isEmpty) {
runDateExtractorChain(_text)
} else possibleDates
}

private def extractRelativeDateFuture(text: String): Seq[MatchedDateTime] = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@ package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.AnnotatorType.DATE
import com.johnsnowlabs.nlp.{Annotation, AnnotatorType, DataBuilder}
import com.johnsnowlabs.tags.FastTest

import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.{Dataset, Row}

import org.scalatest.flatspec.AnyFlatSpec

import java.util.Calendar
Expand Down Expand Up @@ -347,4 +346,34 @@ class DateMatcherTestSpec extends AnyFlatSpec with DateMatcherBehaviors {

assert(results == expectedDates)
}

"a DataMatcher" should "make a more forceful or proactive approach in finding dates when aggressive match is set" in {

val data = DataBuilder.basicDataBuild(
"See you on next monday.",
"I was born at 01/03/98",
"She was born on 02/03/1966.",
"The project started yesterday and will finish next year.",
"She will graduate by July 2023.",
"She will visit doctor tomorrow and next month again.")

val multiDate = new DateMatcher()
.setInputCols(Array("document"))
.setReadMonthFirst(false)
.setOutputCol("date")
.setInputFormats(Array("dd/MM/yyyy"))
.setOutputFormat("dd/MM/yyyy")
.setAggressiveMatching(true)

val pipeline = new Pipeline().setStages(Array(multiDate))

val annotated = pipeline.fit(data).transform(data)
val collectResult = annotated.select("date").collect()

collectResult.foreach { result =>
val annotations = Annotation.getAnnotations(result, "date")
assert(annotations.nonEmpty)
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -479,4 +479,33 @@ class MultiDateMatcherMultiLanguageTestSpec extends AnyFlatSpec with DateMatcher
assert(results.contains(getOneDayAgoDate()) && results.contains(getInTwoWeeksDate()))
}

"a DataMatcher" should "make a more forceful or proactive approach in finding dates when aggressive match is set" in {

val data = DataBuilder.basicDataBuild(
"See you on next monday.",
"I was born at 01/03/98",
"She was born on 02/03/1966.",
"The project started yesterday and will finish next year.",
"She will graduate by July 2023.",
"She will visit doctor tomorrow and next month again.")

val multiDate = new MultiDateMatcher()
.setInputCols(Array("document"))
.setReadMonthFirst(false)
.setOutputCol("multi_date")
.setInputFormats(Array("dd/MM/yyyy"))
.setOutputFormat("dd/MM/yyyy")
.setAggressiveMatching(true)

val pipeline = new Pipeline().setStages(Array(multiDate))

val annotated = pipeline.fit(data).transform(data)
val collectResult = annotated.select("multi_date").collect()

collectResult.foreach { result =>
val annotations = Annotation.getAnnotations(result, "multi_date")
assert(annotations.nonEmpty)
}
}

}
Loading