Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make lowercasing by Normalizer optional #84

Merged
merged 3 commits into from
Jan 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/components.html
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ <h4 id="Normalizer" class="section-block">3. Normalizer: Text cleaning</h4>
<ul>
<li>
setPattern(pattern): Regular expression for normalization, defaults [^A-Za-z]
</li>
<li>
setLowercase(value): lowercase tokens, default true
</li>
</ul>
<b>Example:</b><br>
Expand Down
6 changes: 6 additions & 0 deletions python/sparknlp/annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ class Normalizer(AnnotatorTransformer):
"normalization regex pattern which match will be replaced with a space",
typeConverter=TypeConverters.toString)

# Whether tokens are converted to lowercase before the normalization
# pattern is applied; mirrors the Scala-side BooleanParam "lowercase"
# (default true on the JVM side). A typeConverter is supplied so that
# setLowercase coerces its argument to a bool, consistent with the
# sibling "pattern" Param which uses TypeConverters.toString.
lowercase = Param(Params._dummy(),
                  "lowercase",
                  "whether to convert strings to lowercase",
                  typeConverter=TypeConverters.toBoolean)

@keyword_only
def __init__(self):
super(Normalizer, self).__init__()
Expand All @@ -123,6 +127,8 @@ def __init__(self):
def setPattern(self, value):
return self._set(pattern=value)

def setLowercase(self, value):
    """Toggle lowercasing of tokens prior to normalization.

    :param value: True to lowercase tokens (the JVM-side default),
        False to preserve the original casing.
    :return: this annotator, to allow call chaining
    """
    self._set(lowercase=value)
    return self

class RegexMatcher(AnnotatorTransformer):

Expand Down
20 changes: 20 additions & 0 deletions python/test/annotators.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,26 @@ def runTest(self):
lemmatizer.transform(tokenized).show()


class NormalizerTestSpec(unittest.TestCase):
    """Smoke test: a Normalizer with lowercasing disabled runs end to end
    over assembled and tokenized input without raising."""

    def setUp(self):
        # Shared test DataFrame exposing a "text" column.
        self.data = SparkContextForTest.data

    def runTest(self):
        document_assembler = DocumentAssembler() \
            .setInputCol("text") \
            .setOutputCol("document")
        tokenizer = RegexTokenizer() \
            .setOutputCol("token")
        # Renamed from "lemmatizer": the original variable name was a
        # copy-paste leftover from LemmatizerTestSpec and was misleading.
        normalizer = Normalizer() \
            .setInputCols(["token"]) \
            .setOutputCol("normalized_token") \
            .setLowercase(False)
        assembled = document_assembler.transform(self.data)
        tokenized = tokenizer.transform(assembled)
        normalizer.transform(tokenized).show()


class DateMatcherTestSpec(unittest.TestCase):

def setUp(self):
Expand Down
15 changes: 12 additions & 3 deletions src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.param.{BooleanParam, Param}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

/**
Expand All @@ -17,20 +17,29 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] {
override val requiredAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

val pattern = new Param[String](this, "pattern", "normalization regex pattern which match will be replaced with a space")
val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")

setDefault(pattern, "[^a-zA-Z]")
setDefault(lowercase, true)

def getPattern: String = $(pattern)

def setPattern(value: String): this.type = set(pattern, value)

def getLowercase: Boolean = $(lowercase)

def setLowercase(value: Boolean): this.type = set(lowercase, value)

def this() = this(Identifiable.randomUID("NORMALIZER"))

/** ToDo: Review implementation, Current implementation generates spaces between non-words, potentially breaking tokens*/
override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
annotations.map { token =>
val nToken = token.result
.toLowerCase
val cased =
if ($(lowercase)) token.result.toLowerCase
else token.result

val nToken = cased
.replaceAll($(pattern), "")
.trim
Annotation(
Expand Down
8 changes: 8 additions & 0 deletions src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
normalizer.transform(withTokenizer(dataset))
}

/** Applies a Normalizer with lowercasing turned off to the tokenized
  * form of the given dataset, writing results to the "normalized" column. */
def withCaseSensitiveNormalizer(dataset: Dataset[Row]): Dataset[Row] = {
  val caseSensitiveNormalizer = new Normalizer()
  caseSensitiveNormalizer.setInputCols(Array("token"))
  caseSensitiveNormalizer.setOutputCol("normalized")
  caseSensitiveNormalizer.setLowercase(false)
  caseSensitiveNormalizer.transform(withTokenizer(dataset))
}

def withFullLemmatizer(dataset: Dataset[Row]): Dataset[Row] = {
val lemmatizer = new Lemmatizer()
.setInputCols(Array("token"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ trait NormalizerBehaviors { this: FlatSpec =>
AnnotatorBuilder.withFullNormalizer(dataset)
.collect().foreach {
row =>
row.getSeq[Row](3)
row.getSeq[Row](4)
.map(Annotation(_))
.foreach {
case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
Expand All @@ -21,4 +21,25 @@ trait NormalizerBehaviors { this: FlatSpec =>
}
}
}

/** Verifies that a Normalizer with lowercasing disabled preserves token
  * casing: every normalized token must equal the raw token with non-letter
  * characters stripped, and no lowercasing applied.
  *
  * Fixes: deprecated Scala procedure syntax replaced with an explicit
  * `: Unit =` result type; the equality assertion now carries a failure
  * clue so mismatches report the offending token pair.
  */
def lowercasingNormalizerPipeline(dataset: => Dataset[Row]): Unit = {
  "A case-sensitive Normalizer Annotator" should "successfully transform data" in {
    AnnotatorBuilder.withCaseSensitiveNormalizer(dataset)
      .collect().foreach {
      row =>
        // Column 3 holds the raw tokens, column 4 the normalized output
        // (positions follow the schema produced by AnnotatorBuilder).
        val tokens = row.getSeq[Row](3).map(Annotation(_))
        val normalizedAnnotations = row.getSeq[Row](4).map(Annotation(_))
        normalizedAnnotations.foreach {
          case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
            assert(stem.result.nonEmpty, "Annotation result exists")
          case _ =>
        }

        normalizedAnnotations.zip(tokens).foreach {
          case (stem: Annotation, token: Annotation) =>
            // Pattern mirrors the Normalizer default "[^a-zA-Z]"; casing
            // must survive because lowercasing was disabled.
            assert(stem.result == token.result.replaceAll("[^a-zA-Z]", ""),
              s"normalized '${stem.result}' should equal case-preserved, letters-only '${token.result}'")
        }
    }
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ class NormalizerTestSpec extends FlatSpec with NormalizerBehaviors {
val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)

"A full Normalizer pipeline with latin content" should behave like fullNormalizerPipeline(latinBodyData)

"A Normalizer pipeline with latin content and disabled lowercasing" should behave like lowercasingNormalizerPipeline(latinBodyData)
}