Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SPARKNLP-832-MultiDateMatcher-doesn-t-return-multiple-dates #13783

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions python/sparknlp/annotator/matcher/date_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ class DateMatcherUtils(Params):
"source language for explicit translation",
typeConverter=TypeConverters.toString)

# Name of the match strategy used when searching for relaxed dates; stored as a string.
relaxedFactoryStrategy = Param(
    parent=Params._dummy(),
    name="relaxedFactoryStrategy",
    doc="Matched Strategy to searches relaxed dates",
    typeConverter=TypeConverters.toString,
)

def setInputFormats(self, value):
"""Sets input formats patterns to match in the documents.

Expand Down Expand Up @@ -159,6 +164,19 @@ def setAnchorDateDay(self, value):
"""
return self._set(anchorDateDay=value)

def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST):
"""Sets the match strategy used when searching for relaxed dates.

The value is stored in the ``relaxedFactoryStrategy`` param.

Parameters
----------
matchStrategy : str
One of the ``MatchStrategy`` constants, by default ``MatchStrategy.MATCH_FIRST``
"""
return self._set(relaxedFactoryStrategy=matchStrategy)


class DateMatcher(AnnotatorModel, DateMatcherUtils):
"""Matches standard date formats into a provided format
Expand Down
1 change: 1 addition & 0 deletions python/sparknlp/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@
from sparknlp.common.storage import *
from sparknlp.common.utils import *
from sparknlp.common.annotator_type import *
from sparknlp.common.match_strategy import *
33 changes: 33 additions & 0 deletions python/sparknlp/common/match_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2017-2023 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Allowed strategies for RuleFactory applications regarding matching"""


class MatchStrategy(object):
"""Object that contains constants for the match strategies used in RuleFactory.

Each constant is the plain string name of the strategy.

Possible values are:

================================== ===============================================================================
Value                              Description
================================== ===============================================================================
``MatchStrategy.MATCH_ALL``        This strategy matches all occurrences of all rules in the given text.
``MatchStrategy.MATCH_FIRST``      This strategy matches only the first occurrence of each rule in the given text.
``MatchStrategy.MATCH_COMPLETE``   Presumably keeps a rule match only if it spans the whole text (TODO confirm).
================================== ===============================================================================
"""
MATCH_ALL = "MATCH_ALL"
MATCH_FIRST = "MATCH_FIRST"
MATCH_COMPLETE = "MATCH_COMPLETE"
50 changes: 50 additions & 0 deletions python/test/annotator/matcher/multi_date_matcher_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import pytest

from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql.functions import size
from test.util import SparkContextForTest


@pytest.mark.fast
class MultiDateMatcherTestSpec(unittest.TestCase):
    """Checks that MultiDateMatcher with MATCH_ALL returns every date found in a document."""

    def setUp(self):
        # A single document holding two lease periods, i.e. four written dates.
        text = """
Lease Period Monthly Installment of Base Rent.
January 1, 2021 –December 31, 2021 $20,304.85 .
January 1, 2022 –December 31, 2022 $20,914.00 .
"""
        self.data = SparkContextForTest.spark.createDataFrame([[text]]).toDF("text")

    def runTest(self):
        document_assembler = DocumentAssembler() \
            .setInputCol("text") \
            .setOutputCol("document")
        date_matcher = MultiDateMatcher() \
            .setInputCols(["document"]) \
            .setOutputCol("date") \
            .setOutputFormat("yyyy/MM/dd") \
            .setRelaxedFactoryStrategy(MatchStrategy.MATCH_ALL)

        pipeline = Pipeline(stages=[document_assembler, date_matcher])
        model = pipeline.fit(self.data)
        result = model.transform(self.data)

        # Number of date annotations produced for the single input row.
        actual_dates = result.select(size("date.result")).collect()[0][0]
        # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
        self.assertEqual(actual_dates, 4)
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@

package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory}
import com.johnsnowlabs.nlp.util.io.MatchStrategy
import com.johnsnowlabs.nlp.util.regex.RuleFactory
import org.apache.spark.ml.param._

import java.util.Calendar
Expand Down Expand Up @@ -249,6 +250,31 @@ trait DateMatcherUtils extends Params {
*/
def setSourceLanguage(value: String): this.type = set(sourceLanguage, value)

/** Name of the [[MatchStrategy]] applied when searching for relaxed dates. The strategy is kept
  * as its string representation.
  *
  * @group param
  */
val relaxedFactoryStrategy: Param[String] = new Param[String](
  this,
  "relaxedFactoryStrategy",
  "Matched Strategy to searches relaxed dates")

/** Sets the match strategy applied when searching for relaxed dates. The enumeration value is
  * stored through its `toString` name.
  *
  * @param matchStrategy
  *   strategy to use, `MatchStrategy.MATCH_FIRST` when omitted
  * @group param
  */
def setRelaxedFactoryStrategy(
    matchStrategy: MatchStrategy.Format = MatchStrategy.MATCH_FIRST): this.type =
  set(relaxedFactoryStrategy, matchStrategy.toString)

/** Returns the name of the match strategy applied when searching for relaxed dates, falling
  * back to the param's default when it was not set explicitly.
  *
  * @group param
  */
def getRelaxedFactoryStrategy: String = getOrDefault(relaxedFactoryStrategy)

setDefault(
inputFormats -> Array(""),
outputFormat -> "yyyy/MM/dd",
Expand All @@ -257,7 +283,8 @@ trait DateMatcherUtils extends Params {
anchorDateDay -> -1,
readMonthFirst -> true,
defaultDayWhenMissing -> 1,
sourceLanguage -> "en")
sourceLanguage -> "en",
relaxedFactoryStrategy -> MatchStrategy.MATCH_FIRST.toString)

protected val formalFactoryInputFormats = new RuleFactory(MatchStrategy.MATCH_ALL)

Expand Down Expand Up @@ -322,11 +349,10 @@ trait DateMatcherUtils extends Params {
.addRule(formalDateAlt2, "formal date with year at beginning")
.addRule(formalDateShort, "formal date short version")

/** Searches relaxed dates by ordered rules by more exhaustive to less Strategy used is to match
* first only. any other matches discarded Auto completes short versions of months. Any two
* digit year is considered to be XX century
/** Searches relaxed dates by ordered rules by more exhaustive to less Strategy. Auto completes
* short versions of months. Any two digit year is considered to be XX century
*/
protected val relaxedFactory: RuleFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
protected lazy val relaxedFactory: RuleFactory = new RuleFactory(getRelaxedFactoryStrategy)
.addRule(relaxedDayNumbered, "relaxed days")
.addRule(relaxedMonths.r, "relaxed months exclusive")
.addRule(relaxedYear, "relaxed year")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,14 @@ class MultiDateMatcher(override val uid: String)

/** Extracts the relaxed dates found in `text`.
  *
  * The matches returned by `relaxedFactory` are bucketed by their `indexMatch`, and every bucket
  * is turned into dates by `computePossibleDates`. Buckets are visited in ascending `indexMatch`
  * order, so the order of the result no longer depends on `Map` iteration order.
  *
  * NOTE(review): `indexMatch` is numbered per rule, so bucket k combines the k-th match of every
  * rule. Presumably this only pairs day, month and year correctly when each rule matches the
  * same number of times in `text` - verify against inputs containing extra numbers.
  *
  * @param text
  *   text to search for relaxed dates
  * @return
  *   the dates computed from each bucket of matches
  */
private def extractRelaxedDate(text: String): Seq[MatchedDateTime] = {
  relaxedFactory
    .findMatch(text)
    .groupBy(_.indexMatch)
    .toSeq
    .sortBy { case (index, _) => index }
    .flatMap { case (_, matchesAtIndex) => computePossibleDates(matchesAtIndex) }
}

private def computePossibleDates(
possibleDates: Seq[RuleFactory.RuleMatch]): Seq[MatchedDateTime] = {
var dayMatch = $(defaultDayWhenMissing)
var monthMatch = defaultMonthWhenMissing
var yearMatch = defaultYearWhenMissing
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT}
import com.johnsnowlabs.nlp._
import com.johnsnowlabs.nlp.serialization.ArrayFeature
import com.johnsnowlabs.nlp.util.regex.MatchStrategy.MatchStrategy
import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RegexRule, RuleFactory, TransformStrategy}
import com.johnsnowlabs.nlp.util.io.MatchStrategy
import com.johnsnowlabs.nlp.util.regex.{RegexRule, RuleFactory, TransformStrategy}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.Identifiable

Expand Down Expand Up @@ -104,7 +104,7 @@ class RegexMatcherModel(override val uid: String)
def getExternalRules: Array[(String, String)] = $$(externalRules)

/** MATCH_ALL|MATCH_FIRST|MATCH_COMPLETE */
private def getFactoryStrategy: MatchStrategy = $(strategy) match {
private def getFactoryStrategy: MatchStrategy.Format = $(strategy) match {
case "MATCH_ALL" => MatchStrategy.MATCH_ALL
case "MATCH_FIRST" => MatchStrategy.MATCH_FIRST
case "MATCH_COMPLETE" => MatchStrategy.MATCH_COMPLETE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, TOKEN}
import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory}
import com.johnsnowlabs.nlp.util.io.{ExternalResource, MatchStrategy, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.util.regex.RuleFactory
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.{BooleanParam, IntParam, Param, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ package com.johnsnowlabs.nlp.annotators.sbd.pragmatic

import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.PragmaticDictionaries._
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.PragmaticSymbols._
import com.johnsnowlabs.nlp.util.regex.MatchStrategy._
import com.johnsnowlabs.nlp.util.io.MatchStrategy.MATCH_ALL
import com.johnsnowlabs.nlp.util.regex.TransformStrategy._
import com.johnsnowlabs.nlp.util.regex.{RegexRule, RuleFactory}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
package com.johnsnowlabs.nlp.annotators.sbd.pragmatic

import com.johnsnowlabs.nlp.annotators.common.Sentence
import com.johnsnowlabs.nlp.util.regex.MatchStrategy.MATCH_ALL
import com.johnsnowlabs.nlp.util.io.MatchStrategy.MATCH_ALL
import com.johnsnowlabs.nlp.util.regex.RuleFactory
import com.johnsnowlabs.nlp.util.regex.TransformStrategy.{
REPLACE_ALL_WITH_SYMBOL,
Expand Down
37 changes: 37 additions & 0 deletions src/main/scala/com/johnsnowlabs/nlp/util/io/MatchStrategy.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright 2017-2023 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.johnsnowlabs.nlp.util.io

import com.johnsnowlabs.nlp.util.regex.RuleFactory

/** Allowed strategies for [[RuleFactory]] applications regarding matching.
  *
  * Values are `MATCH_ALL`, `MATCH_FIRST` and `MATCH_COMPLETE`. A strategy name given as a
  * `String` is converted implicitly through [[str2frmt]].
  */
object MatchStrategy extends Enumeration {
  // Enables the implicit String => Format conversion below without a compiler feature warning.
  import scala.language.implicitConversions

  type Format = Value
  val MATCH_ALL, MATCH_FIRST, MATCH_COMPLETE = Value

  /** Converts a strategy name into its enumeration value, ignoring case.
    *
    * @param v
    *   name of the strategy, e.g. `"MATCH_ALL"` or `"match_all"`
    * @return
    *   the matching [[Format]]
    * @throws scala.MatchError
    *   if `v` does not name one of the known strategies
    */
  implicit def str2frmt(v: String): Format = {
    // Locale.ROOT keeps the upper-casing independent of the JVM default locale; with a Turkish
    // default locale "match_first" would otherwise become "MATCH_FİRST" and be rejected.
    v.toUpperCase(java.util.Locale.ROOT) match {
      case "MATCH_ALL" => MATCH_ALL
      case "MATCH_FIRST" => MATCH_FIRST
      case "MATCH_COMPLETE" => MATCH_COMPLETE
      case _ =>
        throw new MatchError(
          s"Invalid MatchStrategy. Must be either of ${this.values.mkString("|")}")
    }
  }
}
17 changes: 7 additions & 10 deletions src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.johnsnowlabs.nlp.util.regex

import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.RuleSymbols
import com.johnsnowlabs.nlp.util.io.MatchStrategy

import scala.util.matching.Regex

Expand All @@ -27,7 +28,7 @@ import scala.util.matching.Regex
* How to decide when replacing or transforming content with Regex
*/
class RuleFactory(
matchStrategy: MatchStrategy.MatchStrategy,
matchStrategy: MatchStrategy.Format,
transformStrategy: TransformStrategy.TransformStrategy = TransformStrategy.NO_TRANSFORM)
extends RuleSymbols
with Serializable {
Expand Down Expand Up @@ -69,7 +70,9 @@ class RuleFactory(
matchStrategy match {
case MATCH_ALL =>
rules.flatMap(rule =>
rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
rule.regex.findAllMatchIn(text).zipWithIndex.map { case (currentMatch, index) =>
RuleMatch(currentMatch, rule.identifier, index)
})
case MATCH_FIRST =>
rules.flatMap(rule =>
rule.regex.findFirstMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
Expand Down Expand Up @@ -224,7 +227,7 @@ object RuleFactory {
/** Specific partial constructor for [[RuleFactory]] where MatchStrategy might change on runtime
*/
def lateMatching(transformStrategy: TransformStrategy.TransformStrategy)(
matchStrategy: MatchStrategy.MatchStrategy): RuleFactory =
matchStrategy: MatchStrategy.Format): RuleFactory =
new RuleFactory(matchStrategy, transformStrategy)

/** Internal representation of a regex match
Expand All @@ -234,7 +237,7 @@ object RuleFactory {
* @param identifier
* user provided identification of a rule
*/
case class RuleMatch(content: Regex.Match, identifier: String)
case class RuleMatch(content: Regex.Match, identifier: String, indexMatch: Int = -1)
}

/** Allowed strategies for [[RuleFactory]] applications regarding replacement */
Expand All @@ -244,9 +247,3 @@ object TransformStrategy extends Enumeration {
REPLACE_WITH_SYMBOL_AND_BREAK, PROTECT_FROM_BREAK, BREAK_AND_PROTECT_FROM_BREAK,
REPLACE_EACH_WITH_SYMBOL, REPLACE_EACH_WITH_SYMBOL_AND_BREAK = Value
}

/** Allowed strategies for [[RuleFactory]] applications regarding matching */
object MatchStrategy extends Enumeration {
type MatchStrategy = Value
val MATCH_ALL, MATCH_FIRST, MATCH_COMPLETE = Value
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.AnnotatorType.DATE
import com.johnsnowlabs.nlp.util.io.MatchStrategy
import com.johnsnowlabs.nlp.{Annotation, AnnotatorType, DataBuilder}
import com.johnsnowlabs.tags.FastTest
import org.apache.spark.sql.{Dataset, Row}
Expand Down Expand Up @@ -307,4 +308,31 @@ class MultiDateMatcherTestSpec extends AnyFlatSpec with DateMatcherBehaviors {

assert(results == expectedDates)
}

"a MultiDateMatcher" should "correctly find all possible dates in a text" taggedAs FastTest in {

// Single document containing two lease periods, i.e. four written dates.
val data: Dataset[Row] = DataBuilder.multipleDataBuild(Array("""
Lease Period Monthly Installment of Base Rent
January 1, 2021 –December 31, 2021 $20,304.85*
January 1, 2022 –December 31, 2022 $20,914.00
"""))

// MATCH_ALL is requested so that more than the first date of the document is returned.
val dateMatcher = new MultiDateMatcher()
.setInputCols(Array("document"))
.setOutputCol("date")
.setOutputFormat("yyyy/MM/dd")
.setRelaxedFactoryStrategy(MatchStrategy.MATCH_ALL)
.transform(data)

// Sorted by end offset so the comparison does not depend on the order annotations are emitted.
val results = Annotation.collect(dateMatcher, "date").flatten.toSeq.sortBy(_.end)

// NOTE(review): the last two expectations ("2022/01/20" and "2022/12/01") are not dates written
// in the input, which reads "January 1, 2022" and "December 31, 2022". Presumably the day parts
// are being paired with other numbers in the text - confirm this pins intended behaviour rather
// than a matcher defect.
val expectedDates = Seq(
Annotation(DATE, 67, 81, "2021/01/01", Map("sentence" -> "0")),
Annotation(DATE, 84, 100, "2021/12/31", Map("sentence" -> "0")),
Annotation(DATE, 103, 138, "2022/01/20", Map("sentence" -> "0")),
Annotation(DATE, 132, 157, "2022/12/01", Map("sentence" -> "0")))

assert(results == expectedDates)
}

}