Skip to content

Commit

Permalink
Merge pull request #13783 from JohnSnowLabs/bug/SPARKNLP-832-MultiDat…
Browse files Browse the repository at this point in the history
…eMatcher-doesn-t-return-multiple-dates

SPARKNLP-832-MultiDateMatcher-doesn-t-return-multiple-dates
  • Loading branch information
maziyarpanahi authored May 10, 2023
2 parents 4968a8f + a3c3061 commit 2153a06
Show file tree
Hide file tree
Showing 13 changed files with 221 additions and 23 deletions.
18 changes: 18 additions & 0 deletions python/sparknlp/annotator/matcher/date_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ class DateMatcherUtils(Params):
"source language for explicit translation",
typeConverter=TypeConverters.toString)

# String-typed param holding the name of the match strategy applied when
# searching for relaxed dates; it is written by setRelaxedFactoryStrategy.
relaxedFactoryStrategy = Param(Params._dummy(),
"relaxedFactoryStrategy",
"Matched Strategy to searches relaxed dates",
typeConverter=TypeConverters.toString)

def setInputFormats(self, value):
"""Sets input formats patterns to match in the documents.
Expand Down Expand Up @@ -159,6 +164,19 @@ def setAnchorDateDay(self, value):
"""
return self._set(anchorDateDay=value)

def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST):
    """Sets the match strategy used when searching for relaxed dates.

    The value is stored in the ``relaxedFactoryStrategy`` param.

    Parameters
    ----------
    matchStrategy : str, optional
        One of the ``MatchStrategy`` constants (``MATCH_ALL``,
        ``MATCH_FIRST`` or ``MATCH_COMPLETE``), by default
        ``MatchStrategy.MATCH_FIRST``

    Returns
    -------
    The value returned by ``self._set``, so the call can be chained with
    other setters.
    """
    return self._set(relaxedFactoryStrategy=matchStrategy)


class DateMatcher(AnnotatorModel, DateMatcherUtils):
"""Matches standard date formats into a provided format
Expand Down
1 change: 1 addition & 0 deletions python/sparknlp/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@
from sparknlp.common.storage import *
from sparknlp.common.utils import *
from sparknlp.common.annotator_type import *
from sparknlp.common.match_strategy import *
33 changes: 33 additions & 0 deletions python/sparknlp/common/match_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2017-2023 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Allowed strategies for RuleFactory applications regarding replacement"""


class MatchStrategy(object):
    """Constants naming the match strategies used in RuleFactory.

    Possible values are:

    ================================== ==========================================================================================
    Value                              Description
    ================================== ==========================================================================================
    ``MatchStrategy.MATCH_ALL``        This strategy matches all occurrences of all rules in the given text.
    ``MatchStrategy.MATCH_FIRST``      This strategy matches only the first occurrence of each rule in the given text.
    ``MatchStrategy.MATCH_COMPLETE``   Presumably matches a rule only when it covers the complete text (confirm against RuleFactory).
    ================================== ==========================================================================================
    """
    MATCH_ALL = "MATCH_ALL"
    MATCH_FIRST = "MATCH_FIRST"
    # NOTE(review): this constant was previously documented with a verbatim copy
    # of the MATCH_FIRST description; verify the exact MATCH_COMPLETE semantics
    # against the Scala RuleFactory implementation.
    MATCH_COMPLETE = "MATCH_COMPLETE"
50 changes: 50 additions & 0 deletions python/test/annotator/matcher/multi_date_matcher_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright 2017-2022 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import pytest

from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql.functions import size
from test.util import SparkContextForTest


@pytest.mark.fast
class MultiDateMatcherTestSpec(unittest.TestCase):
    """Checks that MultiDateMatcher returns every date found in a multi-line text
    when the relaxed factory strategy is set to MATCH_ALL."""

    def setUp(self):
        """Builds a one-row DataFrame with a ``text`` column containing four dates."""
        text = """
Lease Period Monthly Installment of Base Rent.
January 1, 2021 –December 31, 2021 $20,304.85 .
January 1, 2022 –December 31, 2022 $20,914.00 .
"""
        self.data = SparkContextForTest.spark.createDataFrame([[text]]).toDF("text")

    def runTest(self):
        """Runs DocumentAssembler + MultiDateMatcher and expects four date results."""
        document_assembler = DocumentAssembler() \
            .setInputCol("text") \
            .setOutputCol("document")
        date_matcher = MultiDateMatcher() \
            .setInputCols(["document"]) \
            .setOutputCol("date") \
            .setOutputFormat("yyyy/MM/dd") \
            .setRelaxedFactoryStrategy(MatchStrategy.MATCH_ALL)

        pipeline = Pipeline(stages=[document_assembler, date_matcher])
        model = pipeline.fit(self.data)
        result = model.transform(self.data)

        # Number of date annotations produced for the single input row.
        date_count = result.select(size("date.result")).collect()[0][0]
        # assertEquals is a deprecated alias that was removed in Python 3.12;
        # assertEqual is the supported spelling.
        self.assertEqual(date_count, 4)
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@

package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory}
import com.johnsnowlabs.nlp.util.io.MatchStrategy
import com.johnsnowlabs.nlp.util.regex.RuleFactory
import org.apache.spark.ml.param._

import java.util.Calendar
Expand Down Expand Up @@ -249,6 +250,31 @@ trait DateMatcherUtils extends Params {
*/
def setSourceLanguage(value: String): this.type = set(sourceLanguage, value)

/** Name of the match strategy used when searching for relaxed dates. The value is the string
  * form of a [[MatchStrategy]] entry.
  *
  * @group param
  */
val relaxedFactoryStrategy: Param[String] = new Param[String](
  this,
  "relaxedFactoryStrategy",
  "Matched Strategy to searches relaxed dates")

/** Sets the match strategy used when searching for relaxed dates. The strategy is stored in
  * `relaxedFactoryStrategy` as its string name.
  *
  * @param matchStrategy
  *   strategy to store; defaults to `MatchStrategy.MATCH_FIRST`
  * @group param
  */
def setRelaxedFactoryStrategy(
    matchStrategy: MatchStrategy.Format = MatchStrategy.MATCH_FIRST): this.type =
  set(relaxedFactoryStrategy, matchStrategy.toString)

/** Returns the name of the match strategy currently configured for relaxed date search, as
  * stored in `relaxedFactoryStrategy`.
  *
  * @group param
  */
def getRelaxedFactoryStrategy: String = getOrDefault(relaxedFactoryStrategy)

setDefault(
inputFormats -> Array(""),
outputFormat -> "yyyy/MM/dd",
Expand All @@ -257,7 +283,8 @@ trait DateMatcherUtils extends Params {
anchorDateDay -> -1,
readMonthFirst -> true,
defaultDayWhenMissing -> 1,
sourceLanguage -> "en")
sourceLanguage -> "en",
relaxedFactoryStrategy -> MatchStrategy.MATCH_FIRST.toString)

protected val formalFactoryInputFormats = new RuleFactory(MatchStrategy.MATCH_ALL)

Expand Down Expand Up @@ -322,11 +349,10 @@ trait DateMatcherUtils extends Params {
.addRule(formalDateAlt2, "formal date with year at beginning")
.addRule(formalDateShort, "formal date short version")

/** Searches relaxed dates by ordered rules by more exhaustive to less Strategy used is to match
* first only. any other matches discarded Auto completes short versions of months. Any two
* digit year is considered to be XX century
/** Searches relaxed dates by ordered rules by more exhaustive to less Strategy. Auto completes
* short versions of months. Any two digit year is considered to be XX century
*/
protected val relaxedFactory: RuleFactory = new RuleFactory(MatchStrategy.MATCH_FIRST)
protected lazy val relaxedFactory: RuleFactory = new RuleFactory(getRelaxedFactoryStrategy)
.addRule(relaxedDayNumbered, "relaxed days")
.addRule(relaxedMonths.r, "relaxed months exclusive")
.addRule(relaxedYear, "relaxed year")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,14 @@ class MultiDateMatcher(override val uid: String)

/** Extracts relaxed dates from `text`.
  *
  * Rule matches returned by `relaxedFactory` are grouped by their `indexMatch` and each group is
  * turned into dates by `computePossibleDates`. Groups are processed in ascending `indexMatch`
  * order so that the resulting sequence is deterministic instead of depending on the iteration
  * order of the intermediate `Map`.
  *
  * @param text
  *   text to search for relaxed dates
  * @return
  *   the dates computed from every group of rule matches
  */
private def extractRelaxedDate(text: String): Seq[MatchedDateTime] = {
  val ruleMatches = relaxedFactory.findMatch(text)
  ruleMatches
    .groupBy(_.indexMatch)
    .toSeq
    .sortBy { case (occurrenceIndex, _) => occurrenceIndex }
    .flatMap { case (_, sameIndexMatches) => computePossibleDates(sameIndexMatches) }
}

private def computePossibleDates(
possibleDates: Seq[RuleFactory.RuleMatch]): Seq[MatchedDateTime] = {
var dayMatch = $(defaultDayWhenMissing)
var monthMatch = defaultMonthWhenMissing
var yearMatch = defaultYearWhenMissing
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT}
import com.johnsnowlabs.nlp._
import com.johnsnowlabs.nlp.serialization.ArrayFeature
import com.johnsnowlabs.nlp.util.regex.MatchStrategy.MatchStrategy
import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RegexRule, RuleFactory, TransformStrategy}
import com.johnsnowlabs.nlp.util.io.MatchStrategy
import com.johnsnowlabs.nlp.util.regex.{RegexRule, RuleFactory, TransformStrategy}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.Identifiable

Expand Down Expand Up @@ -104,7 +104,7 @@ class RegexMatcherModel(override val uid: String)
def getExternalRules: Array[(String, String)] = $$(externalRules)

/** MATCH_ALL|MATCH_FIRST|MATCH_COMPLETE */
private def getFactoryStrategy: MatchStrategy = $(strategy) match {
private def getFactoryStrategy: MatchStrategy.Format = $(strategy) match {
case "MATCH_ALL" => MatchStrategy.MATCH_ALL
case "MATCH_FIRST" => MatchStrategy.MATCH_FIRST
case "MATCH_COMPLETE" => MatchStrategy.MATCH_COMPLETE
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ package com.johnsnowlabs.nlp.annotators
import com.johnsnowlabs.nlp.AnnotatorApproach
import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, TOKEN}
import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam
import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory}
import com.johnsnowlabs.nlp.util.io.{ExternalResource, MatchStrategy, ReadAs, ResourceHelper}
import com.johnsnowlabs.nlp.util.regex.RuleFactory
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.param.{BooleanParam, IntParam, Param, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ package com.johnsnowlabs.nlp.annotators.sbd.pragmatic

import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.PragmaticDictionaries._
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.PragmaticSymbols._
import com.johnsnowlabs.nlp.util.regex.MatchStrategy._
import com.johnsnowlabs.nlp.util.io.MatchStrategy.MATCH_ALL
import com.johnsnowlabs.nlp.util.regex.TransformStrategy._
import com.johnsnowlabs.nlp.util.regex.{RegexRule, RuleFactory}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
package com.johnsnowlabs.nlp.annotators.sbd.pragmatic

import com.johnsnowlabs.nlp.annotators.common.Sentence
import com.johnsnowlabs.nlp.util.regex.MatchStrategy.MATCH_ALL
import com.johnsnowlabs.nlp.util.io.MatchStrategy.MATCH_ALL
import com.johnsnowlabs.nlp.util.regex.RuleFactory
import com.johnsnowlabs.nlp.util.regex.TransformStrategy.{
REPLACE_ALL_WITH_SYMBOL,
Expand Down
37 changes: 37 additions & 0 deletions src/main/scala/com/johnsnowlabs/nlp/util/io/MatchStrategy.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright 2017-2023 John Snow Labs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.johnsnowlabs.nlp.util.io

import com.johnsnowlabs.nlp.util.regex.RuleFactory

/** Allowed strategies for [[RuleFactory]] applications regarding matching */
object MatchStrategy extends Enumeration {

  // Enables the implicit String => Format conversion below without a feature warning.
  import scala.language.implicitConversions

  type Format = Value
  val MATCH_ALL, MATCH_FIRST, MATCH_COMPLETE = Value

  /** Converts a strategy name into its enumeration value, ignoring case.
    *
    * @param v
    *   strategy name: "MATCH_ALL", "MATCH_FIRST" or "MATCH_COMPLETE" in any letter case
    * @return
    *   the matching [[Format]] value
    * @throws scala.MatchError
    *   if `v` does not name one of the strategies
    */
  implicit def str2frmt(v: String): Format = {
    // Locale.ROOT keeps the upper-casing independent of the JVM default locale; with a Turkish
    // default locale a lower-case 'i' would otherwise become a dotted capital and never match.
    v.toUpperCase(java.util.Locale.ROOT) match {
      case "MATCH_ALL" => MATCH_ALL
      case "MATCH_FIRST" => MATCH_FIRST
      case "MATCH_COMPLETE" => MATCH_COMPLETE
      case _ =>
        throw new MatchError(
          s"Invalid MatchStrategy. Must be either of ${this.values.mkString("|")}")
    }
  }
}
17 changes: 7 additions & 10 deletions src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.johnsnowlabs.nlp.util.regex

import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.RuleSymbols
import com.johnsnowlabs.nlp.util.io.MatchStrategy

import scala.util.matching.Regex

Expand All @@ -27,7 +28,7 @@ import scala.util.matching.Regex
* How to decide when replacing or transforming content with Regex
*/
class RuleFactory(
matchStrategy: MatchStrategy.MatchStrategy,
matchStrategy: MatchStrategy.Format,
transformStrategy: TransformStrategy.TransformStrategy = TransformStrategy.NO_TRANSFORM)
extends RuleSymbols
with Serializable {
Expand Down Expand Up @@ -69,7 +70,9 @@ class RuleFactory(
matchStrategy match {
case MATCH_ALL =>
rules.flatMap(rule =>
rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
rule.regex.findAllMatchIn(text).zipWithIndex.map { case (currentMatch, index) =>
RuleMatch(currentMatch, rule.identifier, index)
})
case MATCH_FIRST =>
rules.flatMap(rule =>
rule.regex.findFirstMatchIn(text).map(m => RuleMatch(m, rule.identifier)))
Expand Down Expand Up @@ -224,7 +227,7 @@ object RuleFactory {
/** Specific partial constructor for [[RuleFactory]] where MatchStrategy might change on runtime
*/
def lateMatching(transformStrategy: TransformStrategy.TransformStrategy)(
matchStrategy: MatchStrategy.MatchStrategy): RuleFactory =
matchStrategy: MatchStrategy.Format): RuleFactory =
new RuleFactory(matchStrategy, transformStrategy)

/** Internal representation of a regex match
Expand All @@ -234,7 +237,7 @@ object RuleFactory {
* @param identifier
* user provided identification of a rule
*/
case class RuleMatch(content: Regex.Match, identifier: String)
case class RuleMatch(content: Regex.Match, identifier: String, indexMatch: Int = -1)
}

/** Allowed strategies for [[RuleFactory]] applications regarding replacement */
Expand All @@ -244,9 +247,3 @@ object TransformStrategy extends Enumeration {
REPLACE_WITH_SYMBOL_AND_BREAK, PROTECT_FROM_BREAK, BREAK_AND_PROTECT_FROM_BREAK,
REPLACE_EACH_WITH_SYMBOL, REPLACE_EACH_WITH_SYMBOL_AND_BREAK = Value
}

/** Allowed strategies for [[RuleFactory]] applications regarding matching */
object MatchStrategy extends Enumeration {
type MatchStrategy = Value
val MATCH_ALL, MATCH_FIRST, MATCH_COMPLETE = Value
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.AnnotatorType.DATE
import com.johnsnowlabs.nlp.util.io.MatchStrategy
import com.johnsnowlabs.nlp.{Annotation, AnnotatorType, DataBuilder}
import com.johnsnowlabs.tags.FastTest
import org.apache.spark.sql.{Dataset, Row}
Expand Down Expand Up @@ -307,4 +308,31 @@ class MultiDateMatcherTestSpec extends AnyFlatSpec with DateMatcherBehaviors {

assert(results == expectedDates)
}

"a MultiDateMatcher" should "correctly find all possible dates in a text" taggedAs FastTest in {

val data: Dataset[Row] = DataBuilder.multipleDataBuild(Array("""
Lease Period Monthly Installment of Base Rent
January 1, 2021 –December 31, 2021 $20,304.85*
January 1, 2022 –December 31, 2022 $20,914.00
"""))

val dateMatcher = new MultiDateMatcher()
.setInputCols(Array("document"))
.setOutputCol("date")
.setOutputFormat("yyyy/MM/dd")
.setRelaxedFactoryStrategy(MatchStrategy.MATCH_ALL)
.transform(data)

val results = Annotation.collect(dateMatcher, "date").flatten.toSeq.sortBy(_.end)

val expectedDates = Seq(
Annotation(DATE, 67, 81, "2021/01/01", Map("sentence" -> "0")),
Annotation(DATE, 84, 100, "2021/12/31", Map("sentence" -> "0")),
Annotation(DATE, 103, 138, "2022/01/20", Map("sentence" -> "0")),
Annotation(DATE, 132, 157, "2022/12/01", Map("sentence" -> "0")))

assert(results == expectedDates)
}

}

0 comments on commit 2153a06

Please sign in to comment.