From 132072fe5c611e110adff9e588273a9057068060 Mon Sep 17 00:00:00 2001
From: Ivan Vankov
Date: Thu, 27 Apr 2023 12:19:54 +0300
Subject: [PATCH 01/32] limit max col rank to 255

---
 .../com/johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala
index d6aba2e78b1fb9..32fd750ff36dc2 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala
@@ -187,6 +187,7 @@ class TapasEncoder(
   protected val MAX_YEAR = 2120
   protected val MIN_NUMBER_OF_ROWS_WITH_VALUES_PROPORTION = 0.5f
+  protected val MAX_COLUMN_RANK = 255
   protected val ORDINAL_SUFFIXES: Array[String] = Array("st", "nd", "rd", "th")
   protected val NUMBER_WORDS: Array[String] = Array(
@@ -525,9 +526,9 @@ class TapasEncoder(
       columnIds = setMaxSentenceLimit(emptyTokenTypes ++ columnIds ++ padding),
       rowIds = setMaxSentenceLimit(emptyTokenTypes ++ rowIds ++ padding),
       prevLabels = setMaxSentenceLimit(emptyTokenTypes ++ prevLabels ++ padding),
-      columnRanks = setMaxSentenceLimit(emptyTokenTypes ++ columnRanks ++ padding),
+      columnRanks = setMaxSentenceLimit(emptyTokenTypes ++ columnRanks.map(x => scala.math.min(x, MAX_COLUMN_RANK)) ++ padding),
       invertedColumnRanks =
-        setMaxSentenceLimit(emptyTokenTypes ++ invertedColumnRanks ++ padding),
+        setMaxSentenceLimit(emptyTokenTypes ++ invertedColumnRanks.map(x => scala.math.min(x, MAX_COLUMN_RANK)) ++ padding),
       numericRelations = setMaxSentenceLimit(emptyTokenTypes ++ numericRelations ++ padding))
   }

From 61301855d8e1b5dea94b882c88c0368194a01d5a Mon Sep 17 00:00:00 2001
From: Ivan Vankov
Date: Thu, 27 Apr 2023 13:19:19 +0300
Subject: [PATCH 02/32] prettify

---
 .../johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala
index 32fd750ff36dc2..d178bf11f06313 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tapas/TapasEncoder.scala
@@ -526,9 +526,10 @@ class TapasEncoder(
       columnIds = setMaxSentenceLimit(emptyTokenTypes ++ columnIds ++ padding),
       rowIds = setMaxSentenceLimit(emptyTokenTypes ++ rowIds ++ padding),
       prevLabels = setMaxSentenceLimit(emptyTokenTypes ++ prevLabels ++ padding),
-      columnRanks = setMaxSentenceLimit(emptyTokenTypes ++ columnRanks.map(x => scala.math.min(x, MAX_COLUMN_RANK)) ++ padding),
-      invertedColumnRanks =
-        setMaxSentenceLimit(emptyTokenTypes ++ invertedColumnRanks.map(x => scala.math.min(x, MAX_COLUMN_RANK)) ++ padding),
+      columnRanks = setMaxSentenceLimit(
+        emptyTokenTypes ++ columnRanks.map(x => scala.math.min(x, MAX_COLUMN_RANK)) ++ padding),
+      invertedColumnRanks = setMaxSentenceLimit(emptyTokenTypes ++ invertedColumnRanks.map(x =>
+        scala.math.min(x, MAX_COLUMN_RANK)) ++ padding),
       numericRelations = setMaxSentenceLimit(emptyTokenTypes ++ numericRelations ++ padding))
   }

From a56ad800561f18b1d502c072d3382e3ee2fee061 Mon Sep 17 00:00:00 2001
From: Danilo Burbano
Date: Thu, 27 Apr 2023 08:36:00 -0500
Subject: [PATCH 03/32] SPARKNLP-819 Adding changes to make spark-nlp 3.4.0 default version

---
 project/Dependencies.scala | 15 ++++++++++-----
.../sentencepiece/LoadSentencepiece.scala | 2 +- .../nlp/annotators/MultiDateMatcher.scala | 2 +- .../nlp/annotators/ner/dl/LoadsContrib.scala | 2 +- .../nlp/annotators/ner/dl/NerDLApproach.scala | 2 +- .../annotators/tokenizer/bpe/BpeTokenizer.scala | 2 +- .../normalizer/MosesPunctNormalizer.scala | 2 +- .../com/johnsnowlabs/util/CoNLLGenerator.scala | 3 ++- .../nlp/annotators/SparkSessionTest.scala | 2 ++ .../spell/norvig/NorvigSweetingBehaviors.scala | 2 +- .../symmetric/SymmetricDeleteBehaviors.scala | 2 +- .../nlp/util/CoNLLGeneratorTestSpec.scala | 2 +- 12 files changed, 23 insertions(+), 15 deletions(-) diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 26607629fcbd47..2956315905c7f9 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -4,11 +4,12 @@ object Dependencies { /** ------- Spark version start ------- */ /* default spark version to base the APIS */ - val spark33Ver = "3.3.1" + val spark34Ver = "3.4.0" /* only used in unit tests */ val spark30Ver = "3.0.3" val spark31Ver = "3.1.3" val spark32Ver = "3.2.3" + val spark33Ver = "3.3.1" /* required for different hardware */ val is_gpu: String = System.getProperty("is_gpu", "false") @@ -20,9 +21,10 @@ object Dependencies { val is_spark30: String = System.getProperty("is_spark30", "false") val is_spark31: String = System.getProperty("is_spark31", "false") val is_spark32: String = System.getProperty("is_spark32", "false") - val is_spark33: String = System.getProperty("is_spark33", "true") + val is_spark33: String = System.getProperty("is_spark33", "false") + val is_spark34: String = System.getProperty("is_spark33", "true") - val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33) + val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34) /** ------- Spark version end ------- */ @@ -43,16 +45,19 @@ object Dependencies { is_spark30: String, is_spark31: String, is_spark32: String, - is_spark33: String): String = { + is_spark33: String, + is_spark34: String): String = { if (is_spark30.equals("true")) { spark30Ver } else if (is_spark31.equals("true")) { spark31Ver } else if (is_spark32.equals("true")) { spark32Ver + } else if (is_spark32.equals("true")) { + spark33Ver } else { /* default spark version */ - spark33Ver + spark34Ver } } diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/sentencepiece/LoadSentencepiece.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/sentencepiece/LoadSentencepiece.scala index 815bb126449a02..4e089d7edecf3c 100644 --- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/sentencepiece/LoadSentencepiece.scala +++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/sentencepiece/LoadSentencepiece.scala @@ -17,7 +17,7 @@ package com.johnsnowlabs.ml.tensorflow.sentencepiece import com.johnsnowlabs.nlp.util.io.ResourceHelper -import org.apache.commons.lang.SystemUtils +import org.apache.commons.lang3.SystemUtils import org.apache.spark.SparkFiles import org.apache.spark.sql.SparkSession import org.tensorflow.TensorFlow diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala index a2c31e0ddb82cb..53561604988481 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala @@ -18,7 +18,7 @@ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.util.regex.RuleFactory import 
com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, HasSimpleAnnotate} -import org.apache.commons.lang.time.DateUtils +import org.apache.commons.lang3.time.DateUtils import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import java.text.SimpleDateFormat diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/LoadsContrib.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/LoadsContrib.scala index 702e0dd0e9a991..af5e16fd751a11 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/LoadsContrib.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/LoadsContrib.scala @@ -17,7 +17,7 @@ package com.johnsnowlabs.nlp.annotators.ner.dl import com.johnsnowlabs.nlp.util.io.ResourceHelper -import org.apache.commons.lang.SystemUtils +import org.apache.commons.lang3.SystemUtils import org.apache.spark.SparkFiles import org.apache.spark.sql.SparkSession import org.tensorflow.TensorFlow diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala index 93b2ca2325a318..3dc6f1012656f5 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLApproach.scala @@ -27,7 +27,7 @@ import com.johnsnowlabs.nlp.util.io.{OutputHelper, ResourceHelper} import com.johnsnowlabs.nlp.{AnnotatorApproach, AnnotatorType, ParamsAndFeaturesWritable} import com.johnsnowlabs.storage.HasStorageRef import org.apache.commons.io.IOUtils -import org.apache.commons.lang.SystemUtils +import org.apache.commons.lang3.SystemUtils import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.param._ import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala index c777d12f475a65..42d78d27687891 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala @@ -17,7 +17,7 @@ package com.johnsnowlabs.nlp.annotators.tokenizer.bpe import com.johnsnowlabs.nlp.annotators.common.{IndexedToken, Sentence, TokenPiece} -import org.apache.commons.lang.StringUtils +import org.apache.commons.lang3.StringUtils import scala.collection.mutable import scala.collection.mutable.ListBuffer diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/normalizer/MosesPunctNormalizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/normalizer/MosesPunctNormalizer.scala index 38ee5e8fb6df34..27d8df3158ed05 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/normalizer/MosesPunctNormalizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/normalizer/MosesPunctNormalizer.scala @@ -16,7 +16,7 @@ package com.johnsnowlabs.nlp.annotators.tokenizer.normalizer -import org.apache.commons.lang.StringUtils +import org.apache.commons.lang3.StringUtils import scala.util.matching.Regex diff --git a/src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala b/src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala index 48055f8db50668..188d954a6c8572 100644 --- a/src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala +++ b/src/main/scala/com/johnsnowlabs/util/CoNLLGenerator.scala @@ -20,7 +20,7 @@ import org.apache.spark.ml.PipelineModel import 
org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} -import org.apache.commons.lang.StringEscapeUtils.escapeJava +import org.apache.commons.text.StringEscapeUtils.escapeJava import scala.collection.mutable.ArrayBuffer import scala.util.Try @@ -89,6 +89,7 @@ object CoNLLGenerator { CoNLLDataset .coalesce(1) .write + .mode("overwrite") .format("com.databricks.spark.csv") .options(scala.collection.Map("delimiter" -> " ", "emptyValue" -> "")) .save(outputPath) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/SparkSessionTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/SparkSessionTest.scala index bb8f4d291ef18d..1053fffe4309f3 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/SparkSessionTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/SparkSessionTest.scala @@ -35,6 +35,8 @@ trait SparkSessionTest extends BeforeAndAfterAll { this: Suite => val emptyDataSet: Dataset[_] = PipelineModels.dummyDataset val pipeline = new Pipeline() + println(s"Spark version: ${spark.version}") + override def beforeAll(): Unit = { super.beforeAll() diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala index 9db12399c79392..dcba5ea98928cf 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/norvig/NorvigSweetingBehaviors.scala @@ -168,7 +168,7 @@ trait NorvigSweetingBehaviors { this: AnyFlatSpec => "Unknown exception. Please check Spark version for correct handling." } - assert(caught.getMessage == expectedErrorMessage) + assert(caught.getMessage.contains(expectedErrorMessage)) } } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala index 2f5afaedd71d58..77bd4108f5b8ee 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala @@ -299,7 +299,7 @@ trait SymmetricDeleteBehaviors { spell.fit(trainDataSet) } - assert(caught.getMessage == expectedErrorMessage) + assert(caught.getMessage.contains(expectedErrorMessage)) } } diff --git a/src/test/scala/com/johnsnowlabs/nlp/util/CoNLLGeneratorTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/util/CoNLLGeneratorTestSpec.scala index 9a9c2f18134328..be01a7278b3ac8 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/util/CoNLLGeneratorTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/util/CoNLLGeneratorTestSpec.scala @@ -168,7 +168,7 @@ class CoNLLGeneratorTestSpec extends AnyFlatSpec { assert(fileContents == testNERText) } - "The generator" should "work even if token metadata has non-ints" in { + "The generator" should "work even if token metadata has non-ints" taggedAs SlowTest in { val df = ResourceHelper.spark.read.load( "src/test/resources/conllgenerator/conllgenerator_nonint_token_metadata.parquet") From 52555387b612c19939f6e6cd604f366e92c5d4c3 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Thu, 27 Apr 2023 15:49:26 +0200 Subject: [PATCH 04/32] Add unit test for sark 3.4 - fix GA job without spark version - fix sark34 references in build.sbt --- .github/workflows/build_and_test.yml | 52 +++++++++++++++++++++++----- 
project/Dependencies.scala | 4 +-- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 243191303fced0..07868e9b9ddaf8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -31,13 +31,13 @@ on: - 'main' jobs: - spark33: + spark34: if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')" runs-on: macos-latest env: TF_CPP_MIN_LOG_LEVEL: 3 JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" - name: Build and Test on Apache Spark 3.3.x + name: Build and Test on Apache Spark 3.4.x steps: - uses: actions/checkout@v3 @@ -54,19 +54,55 @@ jobs: - name: Install Python packages (Python 3.7) run: | python -m pip install --upgrade pip - pip install pyspark==3.3.1 numpy pytest - - name: Build Spark NLP on Apache Spark 3.3.0 + pip install pyspark==3.4.0 numpy pytest + - name: Build Spark NLP on Apache Spark 3.4.0 run: | brew install sbt - sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 3.3.x + sbt -mem 4096 -Dis_spark34=true clean assemblyAndCopy + - name: Test Spark NLP in Scala - Apache Spark 3.4.x run: | sbt -mem 4096 coverage test - name: Upload coverage data to Coveralls run: sbt coverageReport coveralls env: COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - COVERALLS_FLAG_NAME: Apache Spark 3.3.x - Scala 2.12 + COVERALLS_FLAG_NAME: Apache Spark 3.4.x - Scala 2.12 + - name: Test Spark NLP in Python - Apache Spark 3.4.x + run: | + cd python + python3.7 -m pytest -v -m fast + + spark33: + if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')" + runs-on: macos-latest + env: + TF_CPP_MIN_LOG_LEVEL: 3 + JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" + name: Build and Test on Apache Spark 3.3.x + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-java@v3 + with: + distribution: 'adopt' + java-version: '8' + cache: 'sbt' + - name: Install Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7.7 + architecture: x64 + - name: Install Python packages (Python 3.7) + run: | + python -m pip install --upgrade pip + pip install pyspark==3.3.1 numpy pytest + - name: Build Spark NLP on Apache Spark 3.3.1 + run: | + brew install sbt + sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy + - name: Test Spark NLP in Scala - Apache Spark 3.3.x + run: | + sbt -mem 4096 test - name: Test Spark NLP in Python - Apache Spark 3.3.x run: | cd python @@ -99,7 +135,7 @@ jobs: - name: Build Spark NLP on Apache Spark 3.2.3 run: | brew install sbt - sbt -mem 4096 clean assemblyAndCopy + sbt -mem 4096 -Dis_spark32=true clean assemblyAndCopy - name: Test Spark NLP in Scala - Apache Spark 3.2.x run: | sbt -mem 4096 test diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 2956315905c7f9..f36d7f528d3c54 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -22,7 +22,7 @@ object Dependencies { val is_spark31: String = System.getProperty("is_spark31", "false") val is_spark32: String = System.getProperty("is_spark32", "false") val is_spark33: String = System.getProperty("is_spark33", "false") - val is_spark34: String = System.getProperty("is_spark33", "true") + val is_spark34: String = System.getProperty("is_spark34", "true") val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34) @@ -53,7 +53,7 @@ object Dependencies { spark31Ver } else if (is_spark32.equals("true")) { spark32Ver - } else if (is_spark32.equals("true")) { + } 
else if (is_spark33.equals("true")) { spark33Ver } else { /* default spark version */ From ffe01da6ca58dbe7dc9db47e75470173e613ecc8 Mon Sep 17 00:00:00 2001 From: Devin Ha Date: Thu, 27 Apr 2023 17:07:35 +0200 Subject: [PATCH 05/32] SPARKNLP-828: Raise error when exceeding max input length - Python side now also throws an exception if max length exceeds 512 --- docs/en/transformer_entries/MarianTransformer.md | 4 ++++ .../sparknlp/annotator/seq2seq/marian_transformer.py | 10 +++++++++- .../nlp/annotators/seq2seq/MarianTransformer.scala | 4 ++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/en/transformer_entries/MarianTransformer.md b/docs/en/transformer_entries/MarianTransformer.md index cee944479a2974..7533ed00877882 100644 --- a/docs/en/transformer_entries/MarianTransformer.md +++ b/docs/en/transformer_entries/MarianTransformer.md @@ -13,6 +13,10 @@ development. MarianTransformer uses the models trained by MarianNMT. It is currently the engine behind the Microsoft Translator Neural Machine Translation services and being deployed by many companies, organizations and research projects. +Note that this model only supports inputs up to 512 tokens. If you are working with longer +inputs, consider splitting them first. For example, you can use the SentenceDetectorDL annotator to +split longer texts into sentences. + Pretrained models can be loaded with `pretrained` of the companion object: ``` val marian = MarianTransformer.pretrained() diff --git a/python/sparknlp/annotator/seq2seq/marian_transformer.py b/python/sparknlp/annotator/seq2seq/marian_transformer.py index bc9c318aecf3f8..2ead24b182afc3 100755 --- a/python/sparknlp/annotator/seq2seq/marian_transformer.py +++ b/python/sparknlp/annotator/seq2seq/marian_transformer.py @@ -30,6 +30,11 @@ class MarianTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): Translation services and being deployed by many companies, organizations and research projects. + Note that this model only supports inputs up to 512 tokens. If you are + working with longer inputs, consider splitting them first. For example, you + can use the SentenceDetectorDL annotator to split longer texts into + sentences. + Pretrained models can be loaded with :meth:`.pretrained` of the companion object: @@ -176,13 +181,16 @@ def setLangId(self, value): def setMaxInputLength(self, value): """Sets the maximum length for encoder inputs (source language texts), - by default 40. + by default 40. The value should be less than 512, as the Marian Transformer does not + support inputs longer than 512 tokens. Parameters ---------- value : int The maximum length for encoder inputs (source language texts) """ + if value > 512: + raise ValueError("MarianTransformer model does not support sequences longer than 512.") return self._set(maxInputLength=value) def setMaxOutputLength(self, value): diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala index 381648c91b1c1d..759122dfc08d64 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala @@ -48,6 +48,10 @@ import org.apache.spark.sql.SparkSession * It is currently the engine behind the Microsoft Translator Neural Machine Translation services * and being deployed by many companies, organizations and research projects. * + * Note that this model only supports inputs up to 512 tokens. 
If you are working with longer + * inputs, consider splitting them first. For example, you can use the SentenceDetectorDL + * annotator to split longer texts into sentences first. + * * Pretrained models can be loaded with `pretrained` of the companion object: * {{{ * val marian = MarianTransformer.pretrained() From cce5dc84c94fc8397dc922c9d323455b696eb8dc Mon Sep 17 00:00:00 2001 From: Devin Ha Date: Mon, 1 May 2023 11:51:50 +0200 Subject: [PATCH 06/32] SPARKNLP-828: Add Input limit to all relevant transformer-based Annotators - Added HasMaxSentenceLengthLimit mix-in to check for valid value for maxSentenceLength - Appended tests with new test case for this - Added missing tests for some annotators --- .../albert_for_question_answering.py | 16 +---- .../albert_for_sequence_classification.py | 18 +----- .../albert_for_token_classification.py | 18 +----- .../bert_for_question_answering.py | 24 ++------ .../bert_for_sequence_classification.py | 18 +----- .../bert_for_token_classification.py | 18 +----- .../bert_for_zero_shot_classification.py | 24 ++------ .../camembert_for_question_answering.py | 18 +----- .../camembert_for_sequence_classification.py | 18 +----- .../camembert_for_token_classification.py | 18 +----- .../deberta_for_question_answering.py | 18 +----- .../deberta_for_sequence_classification.py | 19 +----- .../deberta_for_token_classification.py | 18 +----- .../distil_bert_for_question_answering.py | 18 +----- ...distil_bert_for_sequence_classification.py | 18 +----- .../distil_bert_for_token_classification.py | 18 +----- ...istil_bert_for_zero_shot_classification.py | 18 +----- .../longformer_for_question_answering.py | 18 +----- .../longformer_for_sequence_classification.py | 18 +----- .../longformer_for_token_classification.py | 18 +----- .../roberta_for_question_answering.py | 18 +----- .../roberta_for_sequence_classification.py | 18 +----- .../xlm_roberta_for_question_answering.py | 18 +----- ...xlm_roberta_for_sequence_classification.py | 18 +----- .../xlm_roberta_for_token_classification.py | 18 +----- .../xlnet_for_sequence_classification.py | 18 +----- .../xlnet_for_token_classification.py | 18 +----- .../annotator/coref/spanbert_coref.py | 18 +----- .../annotator/embeddings/albert_embeddings.py | 18 +----- .../annotator/embeddings/bert_embeddings.py | 18 +----- .../embeddings/bert_sentence_embeddings.py | 26 ++------ .../embeddings/camembert_embeddings.py | 20 +------ .../embeddings/deberta_embeddings.py | 18 +----- .../embeddings/distil_bert_embeddings.py | 18 +----- .../embeddings/longformer_embeddings.py | 18 +----- .../embeddings/roberta_embeddings.py | 18 +----- .../embeddings/roberta_sentence_embeddings.py | 26 ++------ .../embeddings/xlm_roberta_embeddings.py | 18 +----- .../xlm_roberta_sentence_embeddings.py | 26 ++------ .../annotator/embeddings/xlnet_embeddings.py | 18 +----- python/sparknlp/common/properties.py | 41 +++++++++++++ ...albert_for_sequence_classification_test.py | 55 +++++++++++++++++ .../albert_for_token_classification_test.py | 15 +++-- .../bert_for_question_answering_test.py | 49 +++++++++++++++ .../bert_for_sequence_classification_test.py | 59 +++++++++++++++++++ .../bert_for_token_classification_test.py | 13 ++-- .../bert_for_zero_shot_classification_test.py | 17 +++--- .../camembert_for_question_aswering_test.py | 17 +++--- ...embert_for_sequence_classification_test.py | 55 +++++++++++++++++ ...camembert_for_token_classification_test.py | 12 ++-- .../deberta_for_question_answering_test.py | 49 +++++++++++++++ 
...eberta_for_sequence_classification_test.py | 15 +++-- .../deberta_for_token_classification_test.py | 14 +++-- ...l_bert_for_sequence_classification_test.py | 15 +++-- ..._bert_for_zero_shot_classification_test.py | 17 +++--- .../distilbert_for_question_answering_test.py | 48 +++++++++++++++ ...istilbert_for_token_classification_test.py | 54 +++++++++++++++++ .../longformer_for_question_answering_test.py | 51 ++++++++++++++++ ...former_for_sequence_classification_test.py | 57 ++++++++++++++++++ ...ongformer_for_token_classification_test.py | 15 +++-- .../roberta_for_question_answering_test.py | 49 +++++++++++++++ ...oberta_for_sequence_classification_test.py | 15 +++-- ...xlm_roberta_for_question_answering_test.py | 49 +++++++++++++++ ...oberta_for_sequence_classification_test.py | 16 ++--- ...m_roberta_for_token_classification_test.py | 13 ++-- .../xlnet_for_sequence_classification_test.py | 55 +++++++++++++++++ .../xlnet_for_token_classification_test.py | 14 +++-- python/test/annotator/common/__init__.py | 0 .../common/has_max_sentence_length_test.py | 30 ++++++++++ .../annotator/coref/spanbert_coref_test.py | 16 ++--- .../embeddings/albert_embeddings_test.py | 13 ++-- .../embeddings/bert_embeddings_test.py | 13 ++-- .../bert_sentence_embeddings_test.py | 48 +++++++++++++++ .../embeddings/camembert_embeddings_test.py | 12 ++-- .../embeddings/deberta_embeddings_test.py | 50 ++++++++++++++++ .../embeddings/distilbert_embeddings_test.py | 50 ++++++++++++++++ .../embeddings/longformer_embeddings_test.py | 50 ++++++++++++++++ .../embeddings/roberta_embeddings_test.py | 50 ++++++++++++++++ .../roberta_sentence_embeddings_test.py | 13 ++-- .../embeddings/xlm_roberta_embeddings_test.py | 50 ++++++++++++++++ .../embeddings/xlnet_embeddings_test.py | 11 ++-- 81 files changed, 1266 insertions(+), 776 deletions(-) create mode 100644 python/test/annotator/classifier_dl/albert_for_sequence_classification_test.py create mode 100644 python/test/annotator/classifier_dl/bert_for_question_answering_test.py create mode 100644 python/test/annotator/classifier_dl/bert_for_sequence_classification_test.py create mode 100644 python/test/annotator/classifier_dl/camembert_for_sequence_classification_test.py create mode 100644 python/test/annotator/classifier_dl/deberta_for_question_answering_test.py rename python/test/annotator/{embeddings => classifier_dl}/deberta_for_token_classification_test.py (79%) create mode 100644 python/test/annotator/classifier_dl/distilbert_for_question_answering_test.py create mode 100644 python/test/annotator/classifier_dl/distilbert_for_token_classification_test.py create mode 100644 python/test/annotator/classifier_dl/longformer_for_question_answering_test.py create mode 100644 python/test/annotator/classifier_dl/longformer_for_sequence_classification_test.py create mode 100644 python/test/annotator/classifier_dl/roberta_for_question_answering_test.py create mode 100644 python/test/annotator/classifier_dl/xlm_roberta_for_question_answering_test.py create mode 100644 python/test/annotator/classifier_dl/xlnet_for_sequence_classification_test.py create mode 100644 python/test/annotator/common/__init__.py create mode 100644 python/test/annotator/common/has_max_sentence_length_test.py create mode 100644 python/test/annotator/embeddings/bert_sentence_embeddings_test.py create mode 100644 python/test/annotator/embeddings/deberta_embeddings_test.py create mode 100644 python/test/annotator/embeddings/distilbert_embeddings_test.py create mode 100644 
python/test/annotator/embeddings/longformer_embeddings_test.py create mode 100644 python/test/annotator/embeddings/roberta_embeddings_test.py create mode 100644 python/test/annotator/embeddings/xlm_roberta_embeddings_test.py diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py index 358c56cfd060e2..113cdbfb801184 100755 --- a/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/albert_for_question_answering.py @@ -18,7 +18,8 @@ class AlbertForQuestionAnswering(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """AlbertForQuestionAnswering can load ALBERT Models with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start logits and span end logits). @@ -91,11 +92,6 @@ class AlbertForQuestionAnswering(AnnotatorModel, outputAnnotatorType = AnnotatorType.CHUNK - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with " @@ -117,15 +113,7 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForQuestionAnswering", diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py index 473b1b028d6354..fa9e7b5168b709 100755 --- a/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/albert_for_sequence_classification.py @@ -20,7 +20,8 @@ class AlbertForSequenceClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, HasClassifierActivationProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """AlbertForSequenceClassification can load Albert Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. @@ -104,11 +105,6 @@ class AlbertForSequenceClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -134,16 +130,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py index eedaf3f98edcd6..814ec342a5589b 100755 --- a/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/albert_for_token_classification.py @@ -19,7 +19,8 @@ class AlbertForTokenClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """AlbertForTokenClassification can load ALBERT Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. @@ -100,11 +101,6 @@ class AlbertForTokenClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.NAMED_ENTITY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -126,16 +122,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForTokenClassification", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py index f8b4e95c2b1ede..27b61bdccdd953 100755 --- a/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/bert_for_question_answering.py @@ -18,7 +18,8 @@ class BertForQuestionAnswering(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """BertForQuestionAnswering can load BERT Models with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start logits and span end logits). @@ -67,14 +68,14 @@ class BertForQuestionAnswering(AnnotatorModel, >>> from pyspark.ml import Pipeline >>> documentAssembler = MultiDocumentAssembler() \\ ... .setInputCols(["question", "context"]) \\ - ... .setOutputCol(["document_question", "document_context"]) - >>> spanClassifier = BertForQuestionAnswering.pretrained() \\ + ... .setOutputCols(["document_question", "document_context"]) + >>> questionAnswering = BertForQuestionAnswering.pretrained() \\ ... .setInputCols(["document_question", "document_context"]) \\ ... .setOutputCol("answer") \\ ... .setCaseSensitive(False) >>> pipeline = Pipeline().setStages([ ... documentAssembler, - ... spanClassifier + ... questionAnswering ... 
]) >>> data = spark.createDataFrame([["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", "context") >>> result = pipeline.fit(data).transform(data) @@ -91,11 +92,6 @@ class BertForQuestionAnswering(AnnotatorModel, outputAnnotatorType = AnnotatorType.CHUNK - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -115,16 +111,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.BertForQuestionAnswering", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py index 959afdea292391..8ae3dbf185df6a 100755 --- a/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/bert_for_sequence_classification.py @@ -21,7 +21,8 @@ class BertForSequenceClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, HasClassifierActivationProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """BertForSequenceClassification can load Bert Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. @@ -105,11 +106,6 @@ class per document by averaging probabilities in all sentences, by outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -135,16 +131,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. 
Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py index 2c1d39bf06a149..bef1f945b7ba06 100755 --- a/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/bert_for_token_classification.py @@ -19,7 +19,8 @@ class BertForTokenClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """BertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. @@ -98,11 +99,6 @@ class BertForTokenClassification(AnnotatorModel, outputAnnotatorType: AnnotatorType = AnnotatorType.NAMED_ENTITY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -124,16 +120,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.BertForTokenClassification", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py b/python/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py index 5287ea6f1b216b..afce86ed7e7b6b 100755 --- a/python/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py +++ b/python/sparknlp/annotator/classifier_dl/bert_for_zero_shot_classification.py @@ -21,7 +21,8 @@ class BertForZeroShotClassification(AnnotatorModel, HasBatchedAnnotate, HasClassifierActivationProperties, HasCandidateLabelsProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """BertForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language inference) tasks. Equivalent of `BertForSequenceClassification` models, but these models don't require a hardcoded number of potential classes, they can be chosen at runtime. It usually means it's slower but it is much more @@ -94,7 +95,7 @@ class per document by averaging probabilities in all sentences, by ... tokenizer, ... sequenceClassifier ... 
]) - >>> data = spark.createDataFrame([["I loved this movie when I was a child.", "It was pretty boring."]]).toDF("text") + >>> data = spark.createDataFrame([["I loved this movie when I was a child."], ["It was pretty boring."]]).toDF("text") >>> result = pipeline.fit(data).transform(data) >>> result.select("label.result").show(truncate=False) +------+ @@ -110,11 +111,6 @@ class per document by averaging probabilities in all sentences, by outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -140,16 +136,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as BERT @@ -207,9 +193,9 @@ def pretrained(name="bert_base_cased_zero_shot_classifier_xnli", lang="en", remo name : str, optional Name of the pretrained model, by default "bert_base_cased_zero_shot_classifier_xnli" - lang : str, optional + lang : str, optional Language of the pretrained model, by default "en" - remote_loc : str, optional + remote_loc : str, optional Optional remote address of the resource, by default None. Will use Spark NLPs repositories otherwise. diff --git a/python/sparknlp/annotator/classifier_dl/camembert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/camembert_for_question_answering.py index 7ea479326eecce..02828e034c5125 100755 --- a/python/sparknlp/annotator/classifier_dl/camembert_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/camembert_for_question_answering.py @@ -18,7 +18,8 @@ class CamemBertForQuestionAnswering(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """CamemBertForQuestionAnswering can load CamemBERT Models with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start logits and span end logits). @@ -91,11 +92,6 @@ class CamemBertForQuestionAnswering(AnnotatorModel, outputAnnotatorType = AnnotatorType.CHUNK - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -115,16 +111,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.CamemBertForQuestionAnswering", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py index 6defc968724003..305f9d9453baf0 100644 --- a/python/sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/camembert_for_sequence_classification.py @@ -20,7 +20,8 @@ class CamemBertForSequenceClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, HasClassifierActivationProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """CamemBertForSequenceClassification can load CamemBERT Models with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. @@ -104,11 +105,6 @@ class CamemBertForSequenceClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -134,16 +130,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences, by default True. diff --git a/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py index 9c7445c031bdca..19fd4780f8e583 100755 --- a/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/camembert_for_token_classification.py @@ -18,7 +18,8 @@ class CamemBertForTokenClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """CamemBertForTokenClassification can load CamemBERT Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. @@ -94,11 +95,6 @@ class CamemBertForTokenClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.NAMED_ENTITY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -120,16 +116,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.CamemBertForTokenClassification", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py index 10427ba0f1120a..bc2c09c6a434a3 100755 --- a/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/deberta_for_question_answering.py @@ -18,7 +18,8 @@ class DeBertaForQuestionAnswering(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """DeBertaForQuestionAnswering can load DeBERTa Models with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start logits and span end logits). @@ -91,11 +92,6 @@ class DeBertaForQuestionAnswering(AnnotatorModel, outputAnnotatorType = AnnotatorType.CHUNK - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -115,16 +111,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DeBertaForQuestionAnswering", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py index cb3153f9ecfcd3..9ca03167f9ed7b 100755 --- a/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/deberta_for_sequence_classification.py @@ -19,7 +19,8 @@ class DeBertaForSequenceClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, HasClassifierActivationProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """DeBertaForSequenceClassification can load DeBERTa v2 & v3 Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. @@ -100,11 +101,6 @@ class DeBertaForSequenceClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -130,16 +126,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as @@ -210,4 +196,3 @@ def pretrained(name="deberta_base_sequence_classifier_imdb", lang="en", remote_l """ from sparknlp.pretrained import ResourceDownloader return ResourceDownloader.downloadModel(DeBertaForSequenceClassification, name, lang, remote_loc) - diff --git a/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py index f464b3a0c85682..63936e6fc7c6cb 100755 --- a/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/deberta_for_token_classification.py @@ -19,7 +19,8 @@ class DeBertaForTokenClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """DeBertaForTokenClassification can load DeBERTa v2&v3 Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. @@ -98,11 +99,6 @@ class DeBertaForTokenClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.NAMED_ENTITY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -124,16 +120,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DeBertaForTokenClassification", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py index b16f616520ebf8..15230c58b69791 100755 --- a/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/distil_bert_for_question_answering.py @@ -18,7 +18,8 @@ class DistilBertForQuestionAnswering(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """DistilBertForQuestionAnswering can load DistilBERT Models with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start logits and span end logits). @@ -91,11 +92,6 @@ class DistilBertForQuestionAnswering(AnnotatorModel, outputAnnotatorType = AnnotatorType.CHUNK - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. 
Get with config_proto.SerializeToString()", @@ -115,16 +111,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForQuestionAnswering", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py index a3c0d446181c64..16a7222cb1b1a0 100755 --- a/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/distil_bert_for_sequence_classification.py @@ -20,7 +20,8 @@ class DistilBertForSequenceClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, HasClassifierActivationProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """DistilBertForSequenceClassification can load DistilBERT Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. @@ -104,11 +105,6 @@ class DistilBertForSequenceClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -134,16 +130,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py index 218ba5336bed1a..984f5dfd732179 100755 --- a/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/distil_bert_for_token_classification.py @@ -19,7 +19,8 @@ class DistilBertForTokenClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """DistilBertForTokenClassification can load Bert Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
@@ -96,11 +97,6 @@ class DistilBertForTokenClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.NAMED_ENTITY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -122,16 +118,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForTokenClassification", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py b/python/sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py index f4444b75e9b81b..22d0e9f6fd9ac4 100644 --- a/python/sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py +++ b/python/sparknlp/annotator/classifier_dl/distil_bert_for_zero_shot_classification.py @@ -21,7 +21,8 @@ class DistilBertForZeroShotClassification(AnnotatorModel, HasBatchedAnnotate, HasClassifierActivationProperties, HasCandidateLabelsProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """DistilBertForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language inference) tasks. Equivalent of `DistilBertForSequenceClassification` models, but these models don't require a hardcoded number of potential classes, they can be chosen at runtime. It usually means it's slower but it is much more @@ -110,11 +111,6 @@ class per document by averaging probabilities in all sentences, by outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -140,16 +136,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. 
Due to max sequence length limit in almost all transformer models such as DistilBERT diff --git a/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py index a0ead26735fc43..c6f96e43e0480d 100755 --- a/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/longformer_for_question_answering.py @@ -18,7 +18,8 @@ class LongformerForQuestionAnswering(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasLongMaxSentenceLengthLimit): """LongformerForQuestionAnswering can load Longformer Models with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start logits and span end logits). @@ -91,11 +92,6 @@ class LongformerForQuestionAnswering(AnnotatorModel, outputAnnotatorType = AnnotatorType.CHUNK - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -115,16 +111,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.LongformerForQuestionAnswering", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py index 435a851501dd93..869ea11540a3af 100755 --- a/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/longformer_for_sequence_classification.py @@ -20,7 +20,8 @@ class LongformerForSequenceClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, HasClassifierActivationProperties, - HasEngine): + HasEngine, + HasLongMaxSentenceLengthLimit): """LongformerForSequenceClassification can load Longformer Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. @@ -104,11 +105,6 @@ class LongformerForSequenceClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -134,16 +130,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. 
Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences diff --git a/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py index bb5d00705d9707..b9d4a4b21178ce 100755 --- a/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/longformer_for_token_classification.py @@ -19,7 +19,8 @@ class LongformerForTokenClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasLongMaxSentenceLengthLimit): """LongformerForTokenClassification can load Longformer Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. @@ -97,11 +98,6 @@ class LongformerForTokenClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.NAMED_ENTITY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -123,16 +119,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.LongformerForTokenClassification", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py index 19c042e32fe662..27a6bfdb979ef4 100755 --- a/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/roberta_for_question_answering.py @@ -18,7 +18,8 @@ class RoBertaForQuestionAnswering(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """RoBertaForQuestionAnswering can load RoBERTa Models with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start logits and span end logits). @@ -91,11 +92,6 @@ class RoBertaForQuestionAnswering(AnnotatorModel, outputAnnotatorType = AnnotatorType.CHUNK - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -115,16 +111,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForQuestionAnswering", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py index eeb2c3e8032c15..6b26347c4397ca 100755 --- a/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/roberta_for_sequence_classification.py @@ -20,7 +20,8 @@ class RoBertaForSequenceClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, HasClassifierActivationProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """RoBertaForSequenceClassification can load RoBERTa Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. @@ -104,11 +105,6 @@ class RoBertaForSequenceClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -134,16 +130,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences diff --git a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py index e66b257850759b..97a2a38c08df86 100755 --- a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py +++ b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_question_answering.py @@ -18,7 +18,8 @@ class XlmRoBertaForQuestionAnswering(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """XlmRoBertaForQuestionAnswering can load XLM-RoBERTa Models with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of the hidden-states output to compute span start logits and span end logits). @@ -91,11 +92,6 @@ class XlmRoBertaForQuestionAnswering(AnnotatorModel, outputAnnotatorType = AnnotatorType.CHUNK - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. 
Get with config_proto.SerializeToString()", @@ -115,16 +111,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForQuestionAnswering", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py index a7774e6f38842a..2db4b3b7ae7b2a 100755 --- a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_sequence_classification.py @@ -20,7 +20,8 @@ class XlmRoBertaForSequenceClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, HasClassifierActivationProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """XlmRoBertaForSequenceClassification can load XLM-RoBERTa Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. @@ -104,11 +105,6 @@ class XlmRoBertaForSequenceClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -134,16 +130,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences diff --git a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py index 49b181e4c2dc16..5bdc08f9496fc6 100755 --- a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_token_classification.py @@ -19,7 +19,8 @@ class XlmRoBertaForTokenClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """XlmRoBertaForTokenClassification can load XLM-RoBERTa Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
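The setCoalesceSentences docstrings repeated in the sequence-classification hunks above describe emitting one label per document by averaging the per-sentence probabilities, since models such as BERT cap the input at 512 tokens. A small, framework-free illustration of that averaging step; the helper name and data layout are invented for the example:

    # Hypothetical helper: average sentence-level class probabilities into a
    # single document-level distribution, as setCoalesceSentences(True) implies.
    def coalesce_sentence_probabilities(sentence_probs):
        num_sentences = len(sentence_probs)
        num_classes = len(sentence_probs[0])
        return [
            sum(probs[class_idx] for probs in sentence_probs) / num_sentences
            for class_idx in range(num_classes)
        ]

    # Three sentences, two classes: the document prediction is the average.
    doc_probs = coalesce_sentence_probabilities([[0.9, 0.1], [0.6, 0.4], [0.3, 0.7]])
    print(doc_probs)  # approximately [0.6, 0.4], so class 0 wins for the document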
@@ -94,11 +95,6 @@ class XlmRoBertaForTokenClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.NAMED_ENTITY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -120,16 +116,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForTokenClassification", java_model=None): diff --git a/python/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py b/python/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py index 11ecf8f13a3157..8a6c6ce5111e16 100755 --- a/python/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py +++ b/python/sparknlp/annotator/classifier_dl/xlnet_for_sequence_classification.py @@ -20,7 +20,8 @@ class XlnetForSequenceClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, HasClassifierActivationProperties, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """XlnetForSequenceClassification can load XLNet Models with sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for multi-class document classification tasks. @@ -104,11 +105,6 @@ class XlnetForSequenceClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.CATEGORY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -134,16 +130,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setCoalesceSentences(self, value): """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as BERT (512 tokens), this parameter helps feeding all the sentences diff --git a/python/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py b/python/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py index 8853951128536c..e0d1730b557a5b 100755 --- a/python/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py +++ b/python/sparknlp/annotator/classifier_dl/xlnet_for_token_classification.py @@ -19,7 +19,8 @@ class XlnetForTokenClassification(AnnotatorModel, HasCaseSensitiveProperties, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """XlnetForTokenClassification can load XLNet Models with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
@@ -97,11 +98,6 @@ class XlnetForTokenClassification(AnnotatorModel, outputAnnotatorType = AnnotatorType.NAMED_ENTITY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -123,16 +119,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 128. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.XlnetForTokenClassification", java_model=None): diff --git a/python/sparknlp/annotator/coref/spanbert_coref.py b/python/sparknlp/annotator/coref/spanbert_coref.py index c5e75a5a932743..b0d9d4761e42e0 100644 --- a/python/sparknlp/annotator/coref/spanbert_coref.py +++ b/python/sparknlp/annotator/coref/spanbert_coref.py @@ -20,7 +20,8 @@ class SpanBertCorefModel(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveProperties, HasStorageRef, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """ A coreference resolution model based on SpanBert. @@ -114,11 +115,6 @@ class SpanBertCorefModel(AnnotatorModel, outputAnnotatorType = AnnotatorType.DEPENDENCY - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - maxSegmentLength = Param(Params._dummy(), "maxSegmentLength", "Max segment length", @@ -144,16 +140,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setMaxSegmentLength(self, value): """Sets max segment length diff --git a/python/sparknlp/annotator/embeddings/albert_embeddings.py b/python/sparknlp/annotator/embeddings/albert_embeddings.py index 5290943d34a8e2..72eef1e82d87e4 100755 --- a/python/sparknlp/annotator/embeddings/albert_embeddings.py +++ b/python/sparknlp/annotator/embeddings/albert_embeddings.py @@ -21,7 +21,8 @@ class AlbertEmbeddings(AnnotatorModel, HasCaseSensitiveProperties, HasStorageRef, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """ALBERT: A Lite Bert For Self-Supervised Learning Of Language Representations - Google Research, Toyota Technological Institute at Chicago @@ -163,11 +164,6 @@ class AlbertEmbeddings(AnnotatorModel, "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListInt) - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - def setConfigProtoBytes(self, b): """Sets configProto from tensorflow, serialized into byte array. @@ -178,16 +174,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.AlbertEmbeddings", java_model=None): super(AlbertEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/bert_embeddings.py b/python/sparknlp/annotator/embeddings/bert_embeddings.py index a55ea7744538ce..95d98261640af5 100755 --- a/python/sparknlp/annotator/embeddings/bert_embeddings.py +++ b/python/sparknlp/annotator/embeddings/bert_embeddings.py @@ -20,7 +20,8 @@ class BertEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveProperties, HasStorageRef, - HasBatchedAnnotate): + HasBatchedAnnotate, + HasMaxSentenceLengthLimit): """Token-level embeddings using BERT. BERT (Bidirectional Encoder Representations from Transformers) provides @@ -134,11 +135,6 @@ class BertEmbeddings(AnnotatorModel, outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -154,16 +150,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BertEmbeddings", java_model=None): super(BertEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/bert_sentence_embeddings.py b/python/sparknlp/annotator/embeddings/bert_sentence_embeddings.py index 700af88a8be81e..b33af19584279b 100755 --- a/python/sparknlp/annotator/embeddings/bert_sentence_embeddings.py +++ b/python/sparknlp/annotator/embeddings/bert_sentence_embeddings.py @@ -17,11 +17,12 @@ class BertSentenceEmbeddings(AnnotatorModel, - HasEmbeddingsProperties, - HasCaseSensitiveProperties, - HasStorageRef, - HasBatchedAnnotate, - HasEngine): + HasEmbeddingsProperties, + HasCaseSensitiveProperties, + HasStorageRef, + HasBatchedAnnotate, + HasEngine, + HasMaxSentenceLengthLimit): """Sentence-level embeddings using BERT. BERT (Bidirectional Encoder Representations from Transformers) provides dense vector representations for natural language by using a deep, pre-trained neural network with the @@ -133,11 +134,6 @@ class BertSentenceEmbeddings(AnnotatorModel, outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - isLong = Param(Params._dummy(), "isLong", "Use Long type instead of Int type for inputs buffer - Some Bert models require Long instead of Int.", @@ -158,16 +154,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - def setIsLong(self, value): """Sets whether to use Long type instead of Int type for inputs buffer. 
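For values within a model's limit nothing changes on the user side: the setter and getter that each annotator previously declared are now inherited from the shared property. A hedged usage sketch against the Spark NLP Python API (a running Spark session and pretrained model downloads are assumed; the over-limit behaviour follows the validation added to properties.py later in this series):

    import sparknlp
    from sparknlp.annotator import BertEmbeddings, LongformerEmbeddings

    spark = sparknlp.start()

    # BERT-style annotators keep the 512-token cap from HasMaxSentenceLengthLimit.
    bert = BertEmbeddings.pretrained() \
        .setInputCols(["document", "token"]) \
        .setOutputCol("embeddings") \
        .setMaxSentenceLength(512)      # accepted
    print(bert.getMaxSentenceLength())  # 512

    # Longformer annotators mix in HasLongMaxSentenceLengthLimit (4096 tokens).
    longformer = LongformerEmbeddings.pretrained() \
        .setInputCols(["document", "token"]) \
        .setOutputCol("embeddings") \
        .setMaxSentenceLength(4096)     # accepted only for the long variant

    # bert.setMaxSentenceLength(1024) is expected to raise ValueError after this
    # change, instead of silently configuring an unsupported sequence length.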
diff --git a/python/sparknlp/annotator/embeddings/camembert_embeddings.py b/python/sparknlp/annotator/embeddings/camembert_embeddings.py index 1171d6b4ea082e..2461324b143b23 100755 --- a/python/sparknlp/annotator/embeddings/camembert_embeddings.py +++ b/python/sparknlp/annotator/embeddings/camembert_embeddings.py @@ -21,7 +21,8 @@ class CamemBertEmbeddings(AnnotatorModel, HasCaseSensitiveProperties, HasStorageRef, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """The CamemBERT model was proposed in CamemBERT: a Tasty French Language Model by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah, and Benoît Sagot. @@ -143,13 +144,6 @@ class CamemBertEmbeddings(AnnotatorModel, TypeConverters.toListInt, ) - maxSentenceLength = Param( - Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt, - ) - def setConfigProtoBytes(self, b): """Sets configProto from tensorflow, serialized into byte array. @@ -160,16 +154,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.CamemBertEmbeddings", java_model=None): super(CamemBertEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/deberta_embeddings.py b/python/sparknlp/annotator/embeddings/deberta_embeddings.py index 7898ffbf813dc0..24462a0022b724 100755 --- a/python/sparknlp/annotator/embeddings/deberta_embeddings.py +++ b/python/sparknlp/annotator/embeddings/deberta_embeddings.py @@ -20,7 +20,8 @@ class DeBertaEmbeddings(AnnotatorModel, HasCaseSensitiveProperties, HasStorageRef, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """The DeBERTa model was proposed in DeBERTa: Decoding-enhanced BERT with Disentangled Attention by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google’s BERT model released in 2018 and Facebook’s @@ -141,11 +142,6 @@ class DeBertaEmbeddings(AnnotatorModel, "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListInt) - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - def setConfigProtoBytes(self, b): """Sets configProto from tensorflow, serialized into byte array. @@ -156,16 +152,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.DeBertaEmbeddings", java_model=None): super(DeBertaEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/distil_bert_embeddings.py b/python/sparknlp/annotator/embeddings/distil_bert_embeddings.py index 0afc3a7cedc788..6ae825aec01fde 100755 --- a/python/sparknlp/annotator/embeddings/distil_bert_embeddings.py +++ b/python/sparknlp/annotator/embeddings/distil_bert_embeddings.py @@ -21,7 +21,8 @@ class DistilBertEmbeddings(AnnotatorModel, HasCaseSensitiveProperties, HasStorageRef, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than ``bert-base-uncased``, runs 60% faster while preserving over 95% of BERT's performances as measured @@ -149,11 +150,6 @@ class DistilBertEmbeddings(AnnotatorModel, outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -169,16 +165,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings", java_model=None): super(DistilBertEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/longformer_embeddings.py b/python/sparknlp/annotator/embeddings/longformer_embeddings.py index cab8fab4f3c478..ee286a8b3ed0f9 100755 --- a/python/sparknlp/annotator/embeddings/longformer_embeddings.py +++ b/python/sparknlp/annotator/embeddings/longformer_embeddings.py @@ -21,7 +21,8 @@ class LongformerEmbeddings(AnnotatorModel, HasCaseSensitiveProperties, HasStorageRef, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasLongMaxSentenceLengthLimit): """Longformer is a transformer model for long documents. The Longformer model was presented in `Longformer: The Long-Document Transformer` by Iz Beltagy, Matthew E. Peters, Arman Cohan. longformer-base-4096 is a BERT-like @@ -139,11 +140,6 @@ class LongformerEmbeddings(AnnotatorModel, outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -159,16 +155,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process, by default 1024. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.LongformerEmbeddings", java_model=None): super(LongformerEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/roberta_embeddings.py b/python/sparknlp/annotator/embeddings/roberta_embeddings.py index 24c6596e5c525b..33b5cfc1282f7a 100755 --- a/python/sparknlp/annotator/embeddings/roberta_embeddings.py +++ b/python/sparknlp/annotator/embeddings/roberta_embeddings.py @@ -21,7 +21,8 @@ class RoBertaEmbeddings(AnnotatorModel, HasCaseSensitiveProperties, HasStorageRef, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """Creates word embeddings using RoBERTa. The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT @@ -151,11 +152,6 @@ class RoBertaEmbeddings(AnnotatorModel, outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -171,16 +167,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.RoBertaEmbeddings", java_model=None): super(RoBertaEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py b/python/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py index b1ca8fd3fbc8e0..8a6d02c245af40 100755 --- a/python/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py +++ b/python/sparknlp/annotator/embeddings/roberta_sentence_embeddings.py @@ -17,11 +17,12 @@ class RoBertaSentenceEmbeddings(AnnotatorModel, - HasEmbeddingsProperties, - HasCaseSensitiveProperties, - HasStorageRef, - HasBatchedAnnotate, - HasEngine): + HasEmbeddingsProperties, + HasCaseSensitiveProperties, + HasStorageRef, + HasBatchedAnnotate, + HasEngine, + HasMaxSentenceLengthLimit): """Sentence-level embeddings using RoBERTa. The RoBERTa model was proposed in RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018. It builds on @@ -119,11 +120,6 @@ class RoBertaSentenceEmbeddings(AnnotatorModel, outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -139,16 +135,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings", java_model=None): super(RoBertaSentenceEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py b/python/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py index 46fa091ba05651..8c1b2f31adf7b1 100755 --- a/python/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py +++ b/python/sparknlp/annotator/embeddings/xlm_roberta_embeddings.py @@ -21,7 +21,8 @@ class XlmRoBertaEmbeddings(AnnotatorModel, HasCaseSensitiveProperties, HasStorageRef, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale` by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzman, Edouard @@ -151,11 +152,6 @@ class XlmRoBertaEmbeddings(AnnotatorModel, outputAnnotatorType = AnnotatorType.WORD_EMBEDDINGS - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -171,16 +167,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.XlmRoBertaEmbeddings", java_model=None): super(XlmRoBertaEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py b/python/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py index 4d65e20f6ba7ea..89a5a7e86cc9b7 100755 --- a/python/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py +++ b/python/sparknlp/annotator/embeddings/xlm_roberta_sentence_embeddings.py @@ -17,11 +17,12 @@ class XlmRoBertaSentenceEmbeddings(AnnotatorModel, - HasEmbeddingsProperties, - HasCaseSensitiveProperties, - HasStorageRef, - HasBatchedAnnotate, - HasEngine): + HasEmbeddingsProperties, + HasCaseSensitiveProperties, + HasStorageRef, + HasBatchedAnnotate, + HasEngine, + HasMaxSentenceLengthLimit): """Sentence-level embeddings using XLM-RoBERTa. The XLM-RoBERTa model was proposed in Unsupervised Cross-lingual Representation Learning at Scale by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based @@ -122,11 +123,6 @@ class XlmRoBertaSentenceEmbeddings(AnnotatorModel, outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - configProtoBytes = Param(Params._dummy(), "configProtoBytes", "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", @@ -142,16 +138,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. 
- - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.XlmRoBertaSentenceEmbeddings", java_model=None): super(XlmRoBertaSentenceEmbeddings, self).__init__( diff --git a/python/sparknlp/annotator/embeddings/xlnet_embeddings.py b/python/sparknlp/annotator/embeddings/xlnet_embeddings.py index 31d99263669853..44b406ceaa7b6b 100755 --- a/python/sparknlp/annotator/embeddings/xlnet_embeddings.py +++ b/python/sparknlp/annotator/embeddings/xlnet_embeddings.py @@ -21,7 +21,8 @@ class XlnetEmbeddings(AnnotatorModel, HasCaseSensitiveProperties, HasStorageRef, HasBatchedAnnotate, - HasEngine): + HasEngine, + HasMaxSentenceLengthLimit): """XlnetEmbeddings (XLNet): Generalized Autoregressive Pretraining for Language Understanding @@ -160,11 +161,6 @@ class XlnetEmbeddings(AnnotatorModel, "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", TypeConverters.toListInt) - maxSentenceLength = Param(Params._dummy(), - "maxSentenceLength", - "Max sentence length to process", - typeConverter=TypeConverters.toInt) - def setConfigProtoBytes(self, b): """Sets configProto from tensorflow, serialized into byte array. @@ -175,16 +171,6 @@ def setConfigProtoBytes(self, b): """ return self._set(configProtoBytes=b) - def setMaxSentenceLength(self, value): - """Sets max sentence length to process. - - Parameters - ---------- - value : int - Max sentence length to process - """ - return self._set(maxSentenceLength=value) - @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.XlnetEmbeddings", java_model=None): super(XlnetEmbeddings, self).__init__( diff --git a/python/sparknlp/common/properties.py b/python/sparknlp/common/properties.py index 79db876d7e2445..4fcff5e2399703 100644 --- a/python/sparknlp/common/properties.py +++ b/python/sparknlp/common/properties.py @@ -433,3 +433,44 @@ def setEntailmentIdParam(self, v): entailmentIdParam """ return self._set(entailmentIdParam=v) + + +class HasMaxSentenceLengthLimit: + # Default Value, can be overridden + max_length_limit = 512 + + maxSentenceLength = Param(Params._dummy(), + "maxSentenceLength", + "Max sentence length to process", + typeConverter=TypeConverters.toInt) + + def setMaxSentenceLength(self, value): + """Sets max sentence length to process. + + Note that a maximum limit exists depending on the model. If you are working with long single + sequences, consider splitting up the input first with another annotator e.g. SentenceDetector. + + Parameters + ---------- + value : int + Max sentence length to process + """ + if value > self.max_length_limit: + raise ValueError( + f"{self.__class__.__name__} models do not support token sequences longer than {self.max_length_limit}.\n" + f"Consider splitting up the input first with another annotator e.g. SentenceDetector.") + return self._set(maxSentenceLength=value) + + def getMaxSentenceLength(self): + """Gets max sentence of the model. 
+ + Returns + ------- + int + Max sentence length to process + """ + return self.getOrDefault("maxSentenceLength") + + +class HasLongMaxSentenceLengthLimit(HasMaxSentenceLengthLimit): + max_length_limit = 4096 diff --git a/python/test/annotator/classifier_dl/albert_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/albert_for_sequence_classification_test.py new file mode 100644 index 00000000000000..ca7e3199d6c8da --- /dev/null +++ b/python/test/annotator/classifier_dl/albert_for_sequence_classification_test.py @@ -0,0 +1,55 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class AlbertForSequenceClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + + self.tested_annotator = AlbertForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + albert = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + albert + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() + + print(self.classifier.getClasses()) + print(self.classifier.getBatchSize()) diff --git a/python/test/annotator/classifier_dl/albert_for_token_classification_test.py b/python/test/annotator/classifier_dl/albert_for_token_classification_test.py index fe0af064ed98aa..d88798569e7bff 100644 --- a/python/test/annotator/classifier_dl/albert_for_token_classification_test.py +++ b/python/test/annotator/classifier_dl/albert_for_token_classification_test.py @@ -18,25 +18,28 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest -@pytest.mark.fast -class AlbertForTokenClassificationTestSpec(unittest.TestCase): +@pytest.mark.slow +class AlbertForTokenClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = AlbertForTokenClassification.pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("ner") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") 
tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - token_classifier = AlbertForTokenClassification.pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("ner") + token_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/classifier_dl/bert_for_question_answering_test.py b/python/test/annotator/classifier_dl/bert_for_question_answering_test.py new file mode 100644 index 00000000000000..3a8ce0969619d5 --- /dev/null +++ b/python/test/annotator/classifier_dl/bert_for_question_answering_test.py @@ -0,0 +1,49 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class BertForQuestionAnsweringTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.tested_annotator = BertForQuestionAnswering.pretrained() \ + .setInputCols(["document_question", "document_context"]) \ + .setOutputCol("answer") \ + .setCaseSensitive(False) + + def test_run(self): + documentAssembler = MultiDocumentAssembler() \ + .setInputCols(["question", "context"]) \ + .setOutputCols(["document_question", "document_context"]) + + questionAnswering = self.tested_annotator + + pipeline = Pipeline().setStages([ + documentAssembler, + questionAnswering + ]) + + data = SparkContextForTest.spark.createDataFrame( + [["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", + "context") + result = pipeline.fit(data).transform(data) + result.select("answer.result").show(truncate=False) diff --git a/python/test/annotator/classifier_dl/bert_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/bert_for_sequence_classification_test.py new file mode 100644 index 00000000000000..ab1ae02530f5a0 --- /dev/null +++ b/python/test/annotator/classifier_dl/bert_for_sequence_classification_test.py @@ -0,0 +1,59 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class BertForSequenceClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + + self.tested_annotator = BertForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + doc_classifier = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + doc_classifier + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() + + print(self.classifier.getClasses()) + print(self.classifier.getBatchSize()) + + def test_maxSentenceLength(self): + with pytest.raises(ValueError): + self.classifier.setMaxSentenceLength(5000) diff --git a/python/test/annotator/classifier_dl/bert_for_token_classification_test.py b/python/test/annotator/classifier_dl/bert_for_token_classification_test.py index a1ff86e465dfb6..405a9d806897d9 100644 --- a/python/test/annotator/classifier_dl/bert_for_token_classification_test.py +++ b/python/test/annotator/classifier_dl/bert_for_token_classification_test.py @@ -18,25 +18,28 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class BertForTokenClassificationTestSpec(unittest.TestCase): +class BertForTokenClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = BertForTokenClassification.pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("ner") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - token_classifier = BertForTokenClassification.pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("ner") + token_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/classifier_dl/bert_for_zero_shot_classification_test.py b/python/test/annotator/classifier_dl/bert_for_zero_shot_classification_test.py index e7f9f76f7f23d7..bb851243d8cf66 100644 --- a/python/test/annotator/classifier_dl/bert_for_zero_shot_classification_test.py +++ b/python/test/annotator/classifier_dl/bert_for_zero_shot_classification_test.py @@ -16,29 +16,32 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class BertForZeroShotClassificationTestSpec(unittest.TestCase): +class BertForZeroShotClassificationTestSpec(unittest.TestCase, 
HasMaxSentenceLengthTests): def setUp(self): self.spark = SparkContextForTest.spark self.text = "I have a problem with my iphone that needs to be resolved asap!!" self.inputDataset = self.spark.createDataFrame([[self.text]]) \ .toDF("text") - def runTest(self): + self.tested_annotator = BertForZeroShotClassification \ + .pretrained("bert_base_cased_zero_shot_classifier_xnli") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") \ + .setCandidateLabels(["urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology"]) + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - zero_shot_classifier = BertForZeroShotClassification \ - .pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("class") \ - .setCandidateLabels(["urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology"]) + zero_shot_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/classifier_dl/camembert_for_question_aswering_test.py b/python/test/annotator/classifier_dl/camembert_for_question_aswering_test.py index b91aeb7d844158..f15607bf66e563 100644 --- a/python/test/annotator/classifier_dl/camembert_for_question_aswering_test.py +++ b/python/test/annotator/classifier_dl/camembert_for_question_aswering_test.py @@ -16,11 +16,12 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class CamemBertForQuestionAnsweringTestSpec(unittest.TestCase): +class CamemBertForQuestionAnsweringTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.spark = SparkContextForTest.spark self.question = "Où est-ce que je vis?" @@ -28,17 +29,19 @@ def setUp(self): self.inputDataset = self.spark.createDataFrame([[self.question, self.context]]) \ .toDF("question", "context") - def runTest(self): - document_assembler = MultiDocumentAssembler() \ - .setInputCols("question", "context") \ - .setOutputCols("document_question", "document_context") - - qa_classifier = CamemBertForQuestionAnswering.pretrained() \ + self.tested_annotator = CamemBertForQuestionAnswering.pretrained() \ .setInputCols("document_question", "document_context") \ .setOutputCol("answer") \ .setCaseSensitive(True) \ .setMaxSentenceLength(512) + def test_run(self): + document_assembler = MultiDocumentAssembler() \ + .setInputCols("question", "context") \ + .setOutputCols("document_question", "document_context") + + qa_classifier = self.tested_annotator + pipeline = Pipeline(stages=[ document_assembler, qa_classifier diff --git a/python/test/annotator/classifier_dl/camembert_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/camembert_for_sequence_classification_test.py new file mode 100644 index 00000000000000..435653d161bd95 --- /dev/null +++ b/python/test/annotator/classifier_dl/camembert_for_sequence_classification_test.py @@ -0,0 +1,55 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class CamemBertForSequenceClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + + self.tested_annotator = CamemBertForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("ner") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + doc_classifier = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + doc_classifier + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() + + print(self.classifier.getClasses()) + print(self.classifier.getBatchSize()) diff --git a/python/test/annotator/classifier_dl/camembert_for_token_classification_test.py b/python/test/annotator/classifier_dl/camembert_for_token_classification_test.py index 1adadeab025aec..ba59e4fd880dc0 100644 --- a/python/test/annotator/classifier_dl/camembert_for_token_classification_test.py +++ b/python/test/annotator/classifier_dl/camembert_for_token_classification_test.py @@ -27,17 +27,19 @@ def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = CamemBertForTokenClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("ner") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - token_classifier = CamemBertForTokenClassification \ - .pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("ner") + token_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/classifier_dl/deberta_for_question_answering_test.py b/python/test/annotator/classifier_dl/deberta_for_question_answering_test.py new file mode 100644 index 00000000000000..be1dca27490dd8 --- /dev/null +++ b/python/test/annotator/classifier_dl/deberta_for_question_answering_test.py @@ -0,0 +1,49 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class DeBertaForQuestionAnsweringTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.tested_annotator = DeBertaForQuestionAnswering.pretrained() \ + .setInputCols(["document_question", "document_context"]) \ + .setOutputCol("answer") \ + .setCaseSensitive(False) + + def test_run(self): + documentAssembler = MultiDocumentAssembler() \ + .setInputCols(["question", "context"]) \ + .setOutputCols(["document_question", "document_context"]) + + questionAnswering = self.tested_annotator + + pipeline = Pipeline().setStages([ + documentAssembler, + questionAnswering + ]) + + data = SparkContextForTest.spark.createDataFrame( + [["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", + "context") + result = pipeline.fit(data).transform(data) + result.select("answer.result").show(truncate=False) diff --git a/python/test/annotator/classifier_dl/deberta_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/deberta_for_sequence_classification_test.py index 7621535fdcdece..66bfcdd4cdc1d0 100644 --- a/python/test/annotator/classifier_dl/deberta_for_sequence_classification_test.py +++ b/python/test/annotator/classifier_dl/deberta_for_sequence_classification_test.py @@ -18,26 +18,29 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class DeBertaForSequenceClassificationTestSpec(unittest.TestCase): +class DeBertaForSequenceClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = DeBertaForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - doc_classifier = DeBertaForSequenceClassification \ - .pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("class") + doc_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/embeddings/deberta_for_token_classification_test.py b/python/test/annotator/classifier_dl/deberta_for_token_classification_test.py similarity index 79% rename from python/test/annotator/embeddings/deberta_for_token_classification_test.py rename to python/test/annotator/classifier_dl/deberta_for_token_classification_test.py index 5dfa3ee95fc1fe..624eb676935411 100644 --- 
a/python/test/annotator/embeddings/deberta_for_token_classification_test.py +++ b/python/test/annotator/classifier_dl/deberta_for_token_classification_test.py @@ -18,26 +18,28 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class DeBertaForTokenClassificationTestSpec(unittest.TestCase): +class DeBertaForTokenClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = DeBertaForTokenClassification.pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("ner") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - doc_classifier = DeBertaForTokenClassification \ - .pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("class") + doc_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/classifier_dl/distil_bert_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/distil_bert_for_sequence_classification_test.py index e402ac1a2dc8f3..2087682f820163 100644 --- a/python/test/annotator/classifier_dl/distil_bert_for_sequence_classification_test.py +++ b/python/test/annotator/classifier_dl/distil_bert_for_sequence_classification_test.py @@ -18,25 +18,29 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class DistilBertForSequenceClassificationTestSpec(unittest.TestCase): +class DistilBertForSequenceClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = DistilBertForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - doc_classifier = DistilBertForSequenceClassification.pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("class") + doc_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, @@ -46,4 +50,3 @@ def runTest(self): model = pipeline.fit(self.data) model.transform(self.data).show() - diff --git a/python/test/annotator/classifier_dl/distil_bert_for_zero_shot_classification_test.py b/python/test/annotator/classifier_dl/distil_bert_for_zero_shot_classification_test.py index 1a869b3b0fb4ff..c4b153b24a2b48 100644 --- a/python/test/annotator/classifier_dl/distil_bert_for_zero_shot_classification_test.py +++ b/python/test/annotator/classifier_dl/distil_bert_for_zero_shot_classification_test.py @@ -16,29 +16,32 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import 
HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class DistilBertForZeroShotClassificationTestSpec(unittest.TestCase): +class DistilBertForZeroShotClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.spark = SparkContextForTest.spark self.text = "I have a problem with my iphone that needs to be resolved asap!!" self.inputDataset = self.spark.createDataFrame([[self.text]]) \ .toDF("text") - def runTest(self): + self.tested_annotator = DistilBertForZeroShotClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") \ + .setCandidateLabels(["urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology"]) + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - zero_shot_classifier = DistilBertForZeroShotClassification \ - .pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("class") \ - .setCandidateLabels(["urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology"]) + zero_shot_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/classifier_dl/distilbert_for_question_answering_test.py b/python/test/annotator/classifier_dl/distilbert_for_question_answering_test.py new file mode 100644 index 00000000000000..229f7e5794ad40 --- /dev/null +++ b/python/test/annotator/classifier_dl/distilbert_for_question_answering_test.py @@ -0,0 +1,48 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class DistilBertForQuestionAnsweringTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.tested_annotator = DistilBertForQuestionAnswering.pretrained() \ + .setInputCols(["document_question", "document_context"]) \ + .setOutputCol("answer") \ + .setCaseSensitive(False) + + def test_run(self): + documentAssembler = MultiDocumentAssembler() \ + .setInputCols(["question", "context"]) \ + .setOutputCols(["document_question", "document_context"]) + + questionAnswering = self.tested_annotator + + pipeline = Pipeline().setStages([ + documentAssembler, questionAnswering + ]) + + data = SparkContextForTest.spark.createDataFrame( + [["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", + "context") + result = pipeline.fit(data).transform(data) + result.select("answer.result").show(truncate=False) diff --git a/python/test/annotator/classifier_dl/distilbert_for_token_classification_test.py b/python/test/annotator/classifier_dl/distilbert_for_token_classification_test.py new file mode 100644 index 00000000000000..10a42ed64f8c26 --- /dev/null +++ b/python/test/annotator/classifier_dl/distilbert_for_token_classification_test.py @@ -0,0 +1,54 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class DistilBertForTokenClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + + self.tested_annotator = DistilBertForTokenClassification.pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("ner") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + token_classifier = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + token_classifier + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() + + print(self.tested_annotator.getClasses()) + print(self.tested_annotator.getBatchSize()) diff --git a/python/test/annotator/classifier_dl/longformer_for_question_answering_test.py b/python/test/annotator/classifier_dl/longformer_for_question_answering_test.py new file mode 100644 index 00000000000000..789a75efb3625d --- /dev/null +++ b/python/test/annotator/classifier_dl/longformer_for_question_answering_test.py @@ -0,0 +1,51 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class LongformerForQuestionAnsweringTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + valid_max_length = 4096 + + def setUp(self): + self.tested_annotator = LongformerForQuestionAnswering.pretrained() \ + .setInputCols(["document_question", "document_context"]) \ + .setOutputCol("answer") \ + .setCaseSensitive(False) + + def test_run(self): + documentAssembler = MultiDocumentAssembler() \ + .setInputCols(["question", "context"]) \ + .setOutputCols(["document_question", "document_context"]) + + questionAnswering = self.tested_annotator + + pipeline = Pipeline().setStages([ + documentAssembler, + questionAnswering + ]) + + data = SparkContextForTest.spark.createDataFrame( + [["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", + "context") + result = pipeline.fit(data).transform(data) + result.select("answer.result").show(truncate=False) diff --git a/python/test/annotator/classifier_dl/longformer_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/longformer_for_sequence_classification_test.py new file mode 100644 index 00000000000000..d9d0972ebd3842 --- /dev/null +++ b/python/test/annotator/classifier_dl/longformer_for_sequence_classification_test.py @@ -0,0 +1,57 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class LongformerForSequenceClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + valid_max_length = 4096 + + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + + self.tested_annotator = LongformerForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + doc_classifier = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + doc_classifier + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() + + print(self.tested_annotator.getClasses()) + print(self.tested_annotator.getBatchSize()) diff --git a/python/test/annotator/classifier_dl/longformer_for_token_classification_test.py b/python/test/annotator/classifier_dl/longformer_for_token_classification_test.py index e0f97580a0c394..9fdf74f335e0eb 100644 --- a/python/test/annotator/classifier_dl/longformer_for_token_classification_test.py +++ b/python/test/annotator/classifier_dl/longformer_for_token_classification_test.py @@ -18,25 +18,30 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class LongformerForTokenClassificationTestSpec(unittest.TestCase): +class LongformerForTokenClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + valid_max_length = 4096 + def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = LongformerForTokenClassification.pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("ner") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - token_classifier = LongformerForTokenClassification.pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("ner") + token_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/classifier_dl/roberta_for_question_answering_test.py b/python/test/annotator/classifier_dl/roberta_for_question_answering_test.py new file mode 100644 index 00000000000000..457a1cf648c567 --- /dev/null +++ b/python/test/annotator/classifier_dl/roberta_for_question_answering_test.py @@ -0,0 +1,49 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class RoBertaForQuestionAnsweringTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.tested_annotator = RoBertaForQuestionAnswering.pretrained() \ + .setInputCols(["document_question", "document_context"]) \ + .setOutputCol("answer") \ + .setCaseSensitive(False) + + def test_run(self): + documentAssembler = MultiDocumentAssembler() \ + .setInputCols(["question", "context"]) \ + .setOutputCols(["document_question", "document_context"]) + + questionAnswering = self.tested_annotator + + pipeline = Pipeline().setStages([ + documentAssembler, + questionAnswering + ]) + + data = SparkContextForTest.spark.createDataFrame( + [["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", + "context") + result = pipeline.fit(data).transform(data) + result.select("answer.result").show(truncate=False) diff --git a/python/test/annotator/classifier_dl/roberta_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/roberta_for_sequence_classification_test.py index dc9c8cbaf92659..8c79f266a7ece0 100644 --- a/python/test/annotator/classifier_dl/roberta_for_sequence_classification_test.py +++ b/python/test/annotator/classifier_dl/roberta_for_sequence_classification_test.py @@ -18,25 +18,29 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class RoBertaForSequenceClassificationTestSpec(unittest.TestCase): +class RoBertaForSequenceClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = RoBertaForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - doc_classifier = RoBertaForSequenceClassification.pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("class") + doc_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, @@ -46,4 +50,3 @@ def runTest(self): model = pipeline.fit(self.data) model.transform(self.data).show() - diff --git a/python/test/annotator/classifier_dl/xlm_roberta_for_question_answering_test.py b/python/test/annotator/classifier_dl/xlm_roberta_for_question_answering_test.py new file mode 100644 index 00000000000000..15a97648a65803 --- /dev/null +++ b/python/test/annotator/classifier_dl/xlm_roberta_for_question_answering_test.py @@ -0,0 +1,49 @@ +# Copyright 
2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class XlmRoBertaForQuestionAnsweringTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.tested_annotator = XlmRoBertaForQuestionAnswering.pretrained() \ + .setInputCols(["document_question", "document_context"]) \ + .setOutputCol("answer") \ + .setCaseSensitive(False) + + def test_run(self): + documentAssembler = MultiDocumentAssembler() \ + .setInputCols(["question", "context"]) \ + .setOutputCols(["document_question", "document_context"]) + + questionAnswering = self.tested_annotator + + pipeline = Pipeline().setStages([ + documentAssembler, + questionAnswering + ]) + + data = SparkContextForTest.spark.createDataFrame( + [["What's my name?", "My name is Clara and I live in Berkeley."]]).toDF("question", + "context") + result = pipeline.fit(data).transform(data) + result.select("answer.result").show(truncate=False) diff --git a/python/test/annotator/classifier_dl/xlm_roberta_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/xlm_roberta_for_sequence_classification_test.py index ba6ea20976928b..a5f04774b7d7d6 100644 --- a/python/test/annotator/classifier_dl/xlm_roberta_for_sequence_classification_test.py +++ b/python/test/annotator/classifier_dl/xlm_roberta_for_sequence_classification_test.py @@ -18,26 +18,29 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class XlmRoBertaForSequenceClassificationTestSpec(unittest.TestCase): +class XlmRoBertaForSequenceClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = XlmRoBertaForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - doc_classifier = XlmRoBertaForSequenceClassification \ - .pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("class") + doc_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, @@ -47,4 +50,3 @@ def runTest(self): model = pipeline.fit(self.data) model.transform(self.data).show() - diff --git a/python/test/annotator/classifier_dl/xlm_roberta_for_token_classification_test.py 
b/python/test/annotator/classifier_dl/xlm_roberta_for_token_classification_test.py index 9bbc9f42c8a67e..9f80c385f00261 100644 --- a/python/test/annotator/classifier_dl/xlm_roberta_for_token_classification_test.py +++ b/python/test/annotator/classifier_dl/xlm_roberta_for_token_classification_test.py @@ -18,25 +18,28 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class XlmRoBertaForTokenClassificationTestSpec(unittest.TestCase): +class XlmRoBertaForTokenClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = XlmRoBertaForTokenClassification.pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("ner") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - token_classifier = XlmRoBertaForTokenClassification.pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("ner") + token_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/classifier_dl/xlnet_for_sequence_classification_test.py b/python/test/annotator/classifier_dl/xlnet_for_sequence_classification_test.py new file mode 100644 index 00000000000000..ed238d4f741490 --- /dev/null +++ b/python/test/annotator/classifier_dl/xlnet_for_sequence_classification_test.py @@ -0,0 +1,55 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class XlnetForSequenceClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + + self.tested_annotator = XlnetForSequenceClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + doc_classifier = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + doc_classifier + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() + + print(self.tested_annotator.getClasses()) + print(self.tested_annotator.getBatchSize()) diff --git a/python/test/annotator/classifier_dl/xlnet_for_token_classification_test.py b/python/test/annotator/classifier_dl/xlnet_for_token_classification_test.py index 1af60415e9bc7f..bb43beb6ec37a5 100644 --- a/python/test/annotator/classifier_dl/xlnet_for_token_classification_test.py +++ b/python/test/annotator/classifier_dl/xlnet_for_token_classification_test.py @@ -18,25 +18,28 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class XlnetForTokenClassificationTestSpec(unittest.TestCase): +class XlnetForTokenClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = XlnetForTokenClassification.pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("ner") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - token_classifier = XlnetForTokenClassification.pretrained() \ - .setInputCols(["document", "token"]) \ - .setOutputCol("ner") + token_classifier = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, @@ -46,4 +49,3 @@ def runTest(self): model = pipeline.fit(self.data) model.transform(self.data).show() - diff --git a/python/test/annotator/common/__init__.py b/python/test/annotator/common/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/python/test/annotator/common/has_max_sentence_length_test.py b/python/test/annotator/common/has_max_sentence_length_test.py new file mode 100644 index 00000000000000..b3ee336902ff9a --- /dev/null +++ b/python/test/annotator/common/has_max_sentence_length_test.py @@ -0,0 +1,30 @@ +import unittest + +import pytest + +from sparknlp.common.properties import HasMaxSentenceLengthLimit + + +class HasMaxSentenceLengthTests: + tested_annotator = None + valid_max_length = 512 + over_max_length = 5000 + + def test_max_length(self): + if not self.tested_annotator: + raise Exception("Please set the annotator
to \"tested_annotator\" before running this test.") + + self.tested_annotator.setMaxSentenceLength(self.valid_max_length) + + with pytest.raises(ValueError): + self.tested_annotator.setMaxSentenceLength(self.over_max_length) + + +@pytest.mark.fast +class HasMaxSentenceLengthTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + class MockAnnotator(HasMaxSentenceLengthLimit): + def _set(self, maxSentenceLength): + pass + + self.tested_annotator = MockAnnotator() diff --git a/python/test/annotator/coref/spanbert_coref_test.py b/python/test/annotator/coref/spanbert_coref_test.py index a2c709509aab9d..674a42e6b2e2ae 100644 --- a/python/test/annotator/coref/spanbert_coref_test.py +++ b/python/test/annotator/coref/spanbert_coref_test.py @@ -11,18 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os import unittest import pytest from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkSessionForTest @pytest.mark.slow -class SpanBertCorefTestSpec(unittest.TestCase): +class SpanBertCorefTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkSessionForTest.spark.createDataFrame([ @@ -35,7 +35,12 @@ def setUp(self): [" "] ]).toDF("text") - def runTest(self): + self.tested_annotator = SpanBertCorefModel() \ + .pretrained() \ + .setInputCols(["sentences", "tokens"]) \ + .setOutputCol("corefs") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") @@ -48,10 +53,7 @@ def runTest(self): .setInputCols(["sentences"]) \ .setOutputCol("tokens") - coref = SpanBertCorefModel() \ - .pretrained() \ - .setInputCols(["sentences", "tokens"]) \ - .setOutputCol("corefs") + coref = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/embeddings/albert_embeddings_test.py b/python/test/annotator/embeddings/albert_embeddings_test.py index dc69cec2ef5db6..ae41895dfdf4dd 100644 --- a/python/test/annotator/embeddings/albert_embeddings_test.py +++ b/python/test/annotator/embeddings/albert_embeddings_test.py @@ -18,17 +18,22 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class AlbertEmbeddingsTestSpec(unittest.TestCase): +class AlbertEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = AlbertEmbeddings.pretrained() \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") @@ -38,9 +43,7 @@ def runTest(self): tokenizer = Tokenizer() \ .setInputCols(["sentence"]) \ .setOutputCol("token") - albert = AlbertEmbeddings.pretrained() \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") + albert = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/embeddings/bert_embeddings_test.py 
b/python/test/annotator/embeddings/bert_embeddings_test.py index 25d6afa4527b11..ef024f55e10161 100644 --- a/python/test/annotator/embeddings/bert_embeddings_test.py +++ b/python/test/annotator/embeddings/bert_embeddings_test.py @@ -18,17 +18,22 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class BertEmbeddingsTestSpec(unittest.TestCase): +class BertEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") - def runTest(self): + self.tested_annotator = BertEmbeddings.pretrained() \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") @@ -38,9 +43,7 @@ def runTest(self): tokenizer = Tokenizer() \ .setInputCols(["sentence"]) \ .setOutputCol("token") - albert = BertEmbeddings.pretrained() \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") + albert = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/embeddings/bert_sentence_embeddings_test.py b/python/test/annotator/embeddings/bert_sentence_embeddings_test.py new file mode 100644 index 00000000000000..5c3938884ccc2d --- /dev/null +++ b/python/test/annotator/embeddings/bert_sentence_embeddings_test.py @@ -0,0 +1,48 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class BertSentenceEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + + self.tested_annotator = BertSentenceEmbeddings.pretrained() \ + .setInputCols(["document"]) \ + .setOutputCol("sentence_embeddings") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + sentence_embeddings = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + sentence_embeddings + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() diff --git a/python/test/annotator/embeddings/camembert_embeddings_test.py b/python/test/annotator/embeddings/camembert_embeddings_test.py index 1553a27bd7ba47..a9795b7c68c550 100644 --- a/python/test/annotator/embeddings/camembert_embeddings_test.py +++ b/python/test/annotator/embeddings/camembert_embeddings_test.py @@ -18,25 +18,27 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class CamemBertEmbeddingsTestSpec(unittest.TestCase): +class CamemBertEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + self.tested_annotator = CamemBertEmbeddings.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("camembert_embeddings") - def runTest(self): + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") - embeddings = CamemBertEmbeddings.pretrained() \ - .setInputCols(["token", "document"]) \ - .setOutputCol("camembert_embeddings") + embeddings = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, diff --git a/python/test/annotator/embeddings/deberta_embeddings_test.py b/python/test/annotator/embeddings/deberta_embeddings_test.py new file mode 100644 index 00000000000000..5da0bff34eb582 --- /dev/null +++ b/python/test/annotator/embeddings/deberta_embeddings_test.py @@ -0,0 +1,50 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class DeBertaEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + self.tested_annotator = DeBertaEmbeddings.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("camembert_embeddings") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + embeddings = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + embeddings + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() diff --git a/python/test/annotator/embeddings/distilbert_embeddings_test.py b/python/test/annotator/embeddings/distilbert_embeddings_test.py new file mode 100644 index 00000000000000..e6a61cb2bdd2ff --- /dev/null +++ b/python/test/annotator/embeddings/distilbert_embeddings_test.py @@ -0,0 +1,50 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class DistilBertEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + self.tested_annotator = DistilBertEmbeddings.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("camembert_embeddings") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + embeddings = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + embeddings + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() diff --git a/python/test/annotator/embeddings/longformer_embeddings_test.py b/python/test/annotator/embeddings/longformer_embeddings_test.py new file mode 100644 index 00000000000000..c1325ccba684b8 --- /dev/null +++ b/python/test/annotator/embeddings/longformer_embeddings_test.py @@ -0,0 +1,50 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class LongformerEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + self.tested_annotator = LongformerEmbeddings.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("Longformer_embeddings") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + embeddings = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + embeddings + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() diff --git a/python/test/annotator/embeddings/roberta_embeddings_test.py b/python/test/annotator/embeddings/roberta_embeddings_test.py new file mode 100644 index 00000000000000..b6636caea801e2 --- /dev/null +++ b/python/test/annotator/embeddings/roberta_embeddings_test.py @@ -0,0 +1,50 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class RoBertaEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + self.tested_annotator = RoBertaEmbeddings.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("RoBerta_embeddings") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + embeddings = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + embeddings + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() diff --git a/python/test/annotator/embeddings/roberta_sentence_embeddings_test.py b/python/test/annotator/embeddings/roberta_sentence_embeddings_test.py index 35ccb14748e5a5..e0a6cd57572d66 100644 --- a/python/test/annotator/embeddings/roberta_sentence_embeddings_test.py +++ b/python/test/annotator/embeddings/roberta_sentence_embeddings_test.py @@ -18,23 +18,25 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class RoBertaSentenceEmbeddingsTestSpec(unittest.TestCase): +class RoBertaSentenceEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + self.tested_annotator = RoBertaSentenceEmbeddings.pretrained() \ + .setInputCols(["document"]) \ + .setOutputCol("sentence_embeddings") - def runTest(self): + def test_run(self): document_assembler = DocumentAssembler() \ .setInputCol("text") \ .setOutputCol("document") - sentence_embeddings = RoBertaSentenceEmbeddings.pretrained() \ - .setInputCols(["document"]) \ - .setOutputCol("sentence_embeddings") + sentence_embeddings = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, @@ -43,4 +45,3 @@ def runTest(self): model = pipeline.fit(self.data) model.transform(self.data).show() - diff --git a/python/test/annotator/embeddings/xlm_roberta_embeddings_test.py b/python/test/annotator/embeddings/xlm_roberta_embeddings_test.py new file mode 100644 index 00000000000000..39164d16d9344a --- /dev/null +++ b/python/test/annotator/embeddings/xlm_roberta_embeddings_test.py @@ -0,0 +1,50 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests +from test.util import SparkContextForTest + + +@pytest.mark.slow +class XlmRoBertaEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): + def setUp(self): + self.data = SparkContextForTest.spark.read.option("header", "true") \ + .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + self.tested_annotator = XlmRoBertaEmbeddings.pretrained() \ + .setInputCols(["token", "document"]) \ + .setOutputCol("XlmRoBerta_embeddings") + + def test_run(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + embeddings = self.tested_annotator + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + embeddings + ]) + + model = pipeline.fit(self.data) + model.transform(self.data).show() diff --git a/python/test/annotator/embeddings/xlnet_embeddings_test.py b/python/test/annotator/embeddings/xlnet_embeddings_test.py index 707b3045e13ff6..6041a34cf18735 100644 --- a/python/test/annotator/embeddings/xlnet_embeddings_test.py +++ b/python/test/annotator/embeddings/xlnet_embeddings_test.py @@ -18,14 +18,18 @@ from sparknlp.annotator import * from sparknlp.base import * +from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests from test.util import SparkContextForTest @pytest.mark.slow -class XlnetEmbeddingsTestSpec(unittest.TestCase): +class XlnetEmbeddingsTestSpec(unittest.TestCase, HasMaxSentenceLengthTests): def setUp(self): self.data = SparkContextForTest.spark.read.option("header", "true") \ .csv(path="file:///" + os.getcwd() + "/../src/test/resources/embeddings/sentence_embeddings.csv") + self.tested_annotator = XlnetEmbeddings.pretrained() \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") def runTest(self): document_assembler = DocumentAssembler() \ @@ -37,9 +41,7 @@ def runTest(self): tokenizer = Tokenizer() \ .setInputCols(["sentence"]) \ .setOutputCol("token") - xlnet = XlnetEmbeddings.pretrained() \ - .setInputCols(["sentence", "token"]) \ - .setOutputCol("embeddings") + xlnet = self.tested_annotator pipeline = Pipeline(stages=[ document_assembler, @@ -50,4 +52,3 @@ def runTest(self): model = pipeline.fit(self.data) model.transform(self.data).show() - From 713be47bb938408dd71973e20d1e60550b73f621 Mon Sep 17 00:00:00 2001 From: Devin Ha Date: Mon, 1 May 2023 17:13:22 +0200 Subject: [PATCH 07/32] SPARKNLP-797: Introduce Protected Features - Added setProtected method for annotator features, so that they are only settable once - Added test to verify this behavior --- .../nlp/annotators/audio/Wav2Vec2ForCTC.scala | 4 +-- .../dl/AlbertForQuestionAnswering.scala | 2 +- .../dl/AlbertForSequenceClassification.scala | 4 +-- .../dl/AlbertForTokenClassification.scala | 4 +-- .../dl/BertForQuestionAnswering.scala | 4 +-- .../dl/BertForSequenceClassification.scala | 6 ++-- .../dl/BertForTokenClassification.scala | 6 ++-- .../dl/BertForZeroShotClassification.scala | 6 ++-- .../dl/CamemBertForQuestionAnswering.scala | 2 +- .../CamemBertForSequenceClassification.scala | 4 +-- .../dl/CamemBertForTokenClassification.scala | 4 +-- .../dl/DeBertaForQuestionAnswering.scala | 2 +- .../dl/DeBertaForSequenceClassification.scala | 4 +-- .../dl/DeBertaForTokenClassification.scala | 
4 +-- .../dl/DistilBertForQuestionAnswering.scala | 4 +-- .../DistilBertForSequenceClassification.scala | 6 ++-- .../dl/DistilBertForTokenClassification.scala | 7 +++-- .../DistilBertForZeroShotClassification.scala | 6 ++-- .../dl/LongformerForQuestionAnswering.scala | 6 ++-- .../LongformerForSequenceClassification.scala | 8 ++--- .../dl/LongformerForTokenClassification.scala | 8 ++--- .../dl/RoBertaForQuestionAnswering.scala | 6 ++-- .../dl/RoBertaForSequenceClassification.scala | 8 ++--- .../dl/RoBertaForTokenClassification.scala | 8 ++--- .../dl/XlmRoBertaForQuestionAnswering.scala | 2 +- .../XlmRoBertaForSequenceClassification.scala | 4 +-- .../dl/XlmRoBertaForTokenClassification.scala | 4 +-- .../dl/XlnetForSequenceClassification.scala | 4 +-- .../dl/XlnetForTokenClassification.scala | 4 +-- .../annotators/coref/SpanBertCorefModel.scala | 4 +-- .../cv/ViTForImageClassification.scala | 4 +-- .../annotators/seq2seq/BartTransformer.scala | 6 ++-- .../annotators/seq2seq/GPT2Transformer.scala | 4 +-- .../seq2seq/MarianTransformer.scala | 2 +- .../annotators/seq2seq/T5Transformer.scala | 2 +- .../nlp/embeddings/AlbertEmbeddings.scala | 2 +- .../nlp/embeddings/BertEmbeddings.scala | 4 +-- .../embeddings/BertSentenceEmbeddings.scala | 4 +-- .../nlp/embeddings/CamemBertEmbeddings.scala | 2 +- .../nlp/embeddings/DeBertaEmbeddings.scala | 2 +- .../nlp/embeddings/DistilBertEmbeddings.scala | 4 +-- .../nlp/embeddings/LongformerEmbeddings.scala | 6 ++-- .../nlp/embeddings/RoBertaEmbeddings.scala | 6 ++-- .../RoBertaSentenceEmbeddings.scala | 6 ++-- .../nlp/embeddings/XlmRoBertaEmbeddings.scala | 2 +- .../XlmRoBertaSentenceEmbeddings.scala | 2 +- .../nlp/embeddings/XlnetEmbeddings.scala | 2 +- .../nlp/serialization/Feature.scala | 20 ++++++++++++- .../nlp/HasFeaturesTestSpec.scala | 29 +++++++++++++++++++ 49 files changed, 151 insertions(+), 103 deletions(-) create mode 100644 src/test/scala/com/johnsnowlabs/nlp/HasFeaturesTestSpec.scala diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTC.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTC.scala index 2096c7b9a54dab..d2e9501b43869d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTC.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/audio/Wav2Vec2ForCTC.scala @@ -175,7 +175,7 @@ class Wav2Vec2ForCTC(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, BigInt] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, BigInt] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, BigInt]): this.type = set(vocabulary, value) @@ -184,7 +184,7 @@ class Wav2Vec2ForCTC(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala index d7ed75d23d54a3..807818368f2eb1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala @@ -179,7 +179,7 @@ class 
AlbertForQuestionAnswering(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala index 8e7891edcd248b..e4687275e0e238 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala @@ -151,7 +151,7 @@ class AlbertForSequenceClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -222,7 +222,7 @@ class AlbertForSequenceClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala index 5002bf961d7022..f8cc58645e4b92 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala @@ -149,7 +149,7 @@ class AlbertForTokenClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -200,7 +200,7 @@ class AlbertForTokenClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForQuestionAnswering.scala index 548cfcc051941a..a09601f8869546 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForQuestionAnswering.scala @@ -147,7 +147,7 @@ class BertForQuestionAnswering(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -193,7 +193,7 @@ class 
BertForQuestionAnswering(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForSequenceClassification.scala index 19bad2649823f8..281f919f035dcc 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForSequenceClassification.scala @@ -158,7 +158,7 @@ class BertForSequenceClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -167,7 +167,7 @@ class BertForSequenceClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -238,7 +238,7 @@ class BertForSequenceClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForTokenClassification.scala index acf680cf377fac..dbbc7119b89b03 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForTokenClassification.scala @@ -152,7 +152,7 @@ class BertForTokenClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -161,7 +161,7 @@ class BertForTokenClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -212,7 +212,7 @@ class BertForTokenClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala 
b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala index 00d89cd237f2ad..2aa383ff1df6ea 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/BertForZeroShotClassification.scala @@ -163,7 +163,7 @@ class BertForZeroShotClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = { @@ -176,7 +176,7 @@ class BertForZeroShotClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = { @@ -251,7 +251,7 @@ class BertForZeroShotClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala index eb18dd20ce2fad..dc1ca33498eb3a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala @@ -179,7 +179,7 @@ class CamemBertForQuestionAnswering(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala index a71292e0ab50de..dbbc39acc072ad 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala @@ -151,7 +151,7 @@ class CamemBertForSequenceClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -222,7 +222,7 @@ class CamemBertForSequenceClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala index fca3ebc9b701c4..87ecc194509bf5 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala @@ -149,7 +149,7 @@ class CamemBertForTokenClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -200,7 +200,7 @@ class CamemBertForTokenClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala index e5733e89324284..8528a9fed11ab4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala @@ -179,7 +179,7 @@ class DeBertaForQuestionAnswering(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala index d398cbbc6bc69f..4680428e167a7c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala @@ -151,7 +151,7 @@ class DeBertaForSequenceClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -221,7 +221,7 @@ class DeBertaForSequenceClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala index d2e095415b81cb..ba37e969ecc38b 100644 --- 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala @@ -150,7 +150,7 @@ class DeBertaForTokenClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -201,7 +201,7 @@ class DeBertaForTokenClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForQuestionAnswering.scala index daf3cd2711446d..30f51af6cc2390 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForQuestionAnswering.scala @@ -146,7 +146,7 @@ class DistilBertForQuestionAnswering(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -192,7 +192,7 @@ class DistilBertForQuestionAnswering(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForSequenceClassification.scala index 5770da79b8b16c..bcb7f91f94e541 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForSequenceClassification.scala @@ -154,7 +154,7 @@ class DistilBertForSequenceClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -163,7 +163,7 @@ class DistilBertForSequenceClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -234,7 +234,7 @@ class DistilBertForSequenceClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = 
"signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForTokenClassification.scala index ea0fdb602417bc..c2de49344c35f8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForTokenClassification.scala @@ -152,7 +152,7 @@ class DistilBertForTokenClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -161,7 +161,7 @@ class DistilBertForTokenClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -212,7 +212,8 @@ class DistilBertForTokenClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = + new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala index 790597ae89b912..66229423f3674f 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DistilBertForZeroShotClassification.scala @@ -160,7 +160,7 @@ class DistilBertForZeroShotClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = { @@ -173,7 +173,7 @@ class DistilBertForZeroShotClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = { @@ -248,7 +248,7 @@ class DistilBertForZeroShotClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForQuestionAnswering.scala 
b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForQuestionAnswering.scala index 99ab07e7cbd2e4..7812d1985c054b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForQuestionAnswering.scala @@ -148,7 +148,7 @@ class LongformerForQuestionAnswering(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -157,7 +157,7 @@ class LongformerForQuestionAnswering(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) @@ -203,7 +203,7 @@ class LongformerForQuestionAnswering(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForSequenceClassification.scala index a273a1223d6428..1c47b8ab8b81c8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForSequenceClassification.scala @@ -156,7 +156,7 @@ class LongformerForSequenceClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -165,7 +165,7 @@ class LongformerForSequenceClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -179,7 +179,7 @@ class LongformerForSequenceClassification(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) @@ -245,7 +245,7 @@ class LongformerForSequenceClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForTokenClassification.scala index ac1178fe25d959..eaedb9dd4f111c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/LongformerForTokenClassification.scala @@ -154,7 +154,7 @@ class LongformerForTokenClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -163,7 +163,7 @@ class LongformerForTokenClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -177,7 +177,7 @@ class LongformerForTokenClassification(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) @@ -223,7 +223,7 @@ class LongformerForTokenClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnswering.scala index e364fdc668ed7c..b56de867b47a90 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForQuestionAnswering.scala @@ -148,7 +148,7 @@ class RoBertaForQuestionAnswering(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -157,7 +157,7 @@ class RoBertaForQuestionAnswering(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) @@ -203,7 +203,7 @@ class RoBertaForQuestionAnswering(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForSequenceClassification.scala index d1cc5b9f9942f3..373b9ca2b6f230 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForSequenceClassification.scala @@ -156,7 +156,7 @@ class RoBertaForSequenceClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -165,7 +165,7 @@ class RoBertaForSequenceClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -179,7 +179,7 @@ class RoBertaForSequenceClassification(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) @@ -245,7 +245,7 @@ class RoBertaForSequenceClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForTokenClassification.scala index 6c1a84c295949c..204600c19137d4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForTokenClassification.scala @@ -154,7 +154,7 @@ class RoBertaForTokenClassification(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -163,7 +163,7 @@ class RoBertaForTokenClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -177,7 +177,7 @@ class RoBertaForTokenClassification(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) @@ -223,7 +223,7 @@ class 
RoBertaForTokenClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala index f4b66e299ed1a9..5ff4c890a1cfad 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala @@ -179,7 +179,7 @@ class XlmRoBertaForQuestionAnswering(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala index 4376e0cd3197f0..6322e5fd1e528c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala @@ -151,7 +151,7 @@ class XlmRoBertaForSequenceClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -221,7 +221,7 @@ class XlmRoBertaForSequenceClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala index 7b599172e1a1f7..bb1ebc61ece6ae 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala @@ -149,7 +149,7 @@ class XlmRoBertaForTokenClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -200,7 +200,7 @@ class XlmRoBertaForTokenClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def 
setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForSequenceClassification.scala index 970c110462036f..6f89cdbb3842b1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForSequenceClassification.scala @@ -151,7 +151,7 @@ class XlnetForSequenceClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -222,7 +222,7 @@ class XlnetForSequenceClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForTokenClassification.scala index 724da33ca0416f..7a22482e5eb4d5 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlnetForTokenClassification.scala @@ -149,7 +149,7 @@ class XlnetForTokenClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, Int]): this.type = set(labels, value) @@ -200,7 +200,7 @@ class XlnetForTokenClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/coref/SpanBertCorefModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/coref/SpanBertCorefModel.scala index 92767d359dc5ca..023fd737744996 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/coref/SpanBertCorefModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/coref/SpanBertCorefModel.scala @@ -155,7 +155,7 @@ class SpanBertCorefModel(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -201,7 +201,7 @@ class SpanBertCorefModel(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git 
a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ViTForImageClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ViTForImageClassification.scala index 07d979ed0a7425..995928424ca000 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ViTForImageClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/ViTForImageClassification.scala @@ -195,7 +195,7 @@ class ViTForImageClassification(override val uid: String) * * @group param */ - val labels: MapFeature[String, BigInt] = new MapFeature(this, "labels") + val labels: MapFeature[String, BigInt] = new MapFeature(this, "labels").setProtected() /** @group setParam */ def setLabels(value: Map[String, BigInt]): this.type = set(labels, value) @@ -209,7 +209,7 @@ class ViTForImageClassification(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala index 4afd0796a99d08..83d273fa5949cf 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala @@ -396,7 +396,7 @@ class BartTransformer(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { @@ -413,7 +413,7 @@ class BartTransformer(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -422,7 +422,7 @@ class BartTransformer(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala index 5cafbe4865d5ee..246c6e8a6f10ac 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/GPT2Transformer.scala @@ -380,7 +380,7 @@ class GPT2Transformer(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -389,7 +389,7 @@ class GPT2Transformer(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new 
MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala index 381648c91b1c1d..a38554d658d05f 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala @@ -277,7 +277,7 @@ class MarianTransformer(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/T5Transformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/T5Transformer.scala index 2643730c5190f7..aa1808481ba48e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/T5Transformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/T5Transformer.scala @@ -382,7 +382,7 @@ class T5Transformer(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala index 77ffe01ca200d4..054f6ccbb3e2c8 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala @@ -251,7 +251,7 @@ class AlbertEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala index d5f5c1c29b66b3..b1aa6cda0a9b4d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertEmbeddings.scala @@ -178,7 +178,7 @@ class BertEmbeddings(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -224,7 +224,7 @@ class BertEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.scala 
index 89b8e7cc192319..a2b1c9726ccf9d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BertSentenceEmbeddings.scala @@ -163,7 +163,7 @@ class BertSentenceEmbeddings(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** ConfigProto from tensorflow, serialized into byte array. Get with * config_proto.SerializeToString() @@ -291,7 +291,7 @@ class BertSentenceEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala index 70864b1197fd35..0d052504b963c4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala @@ -187,7 +187,7 @@ class CamemBertEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.scala index b0480e888f149c..2ca53cfa6bd01b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/DeBertaEmbeddings.scala @@ -232,7 +232,7 @@ class DeBertaEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.scala index f611156aa7a26f..490c26af568002 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/DistilBertEmbeddings.scala @@ -183,7 +183,7 @@ class DistilBertEmbeddings(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -229,7 +229,7 @@ class DistilBertEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git 
a/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala index fcd536ddefce40..a540c2456fee4f 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/LongformerEmbeddings.scala @@ -175,7 +175,7 @@ class LongformerEmbeddings(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -184,7 +184,7 @@ class LongformerEmbeddings(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) @@ -230,7 +230,7 @@ class LongformerEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala index 5cab98a4e1c4a9..b0ff510d32b850 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaEmbeddings.scala @@ -188,7 +188,7 @@ class RoBertaEmbeddings(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -197,7 +197,7 @@ class RoBertaEmbeddings(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) @@ -243,7 +243,7 @@ class RoBertaEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala index 80310063ada687..dae739cb2706d3 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/RoBertaSentenceEmbeddings.scala @@ -185,7 +185,7 @@ class RoBertaSentenceEmbeddings(override val uid: String) * * @group param */ - val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + val vocabulary: MapFeature[String, Int] = new MapFeature(this, 
"vocabulary").setProtected() /** @group setParam */ def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) @@ -194,7 +194,7 @@ class RoBertaSentenceEmbeddings(override val uid: String) * * @group param */ - val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected() /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) @@ -240,7 +240,7 @@ class RoBertaSentenceEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala index c707857737f1f9..074e4617fe22a9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala @@ -218,7 +218,7 @@ class XlmRoBertaEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala index 05a9ffc3844562..3bbf59b49da296 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala @@ -215,7 +215,7 @@ class XlmRoBertaSentenceEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.scala index adfb783eaedd9e..a6606966ed0f18 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlnetEmbeddings.scala @@ -251,7 +251,7 @@ class XlnetEmbeddings(override val uid: String) * * @group param */ - val signatures = new MapFeature[String, String](model = this, name = "signatures") + val signatures = new MapFeature[String, String](model = this, name = "signatures").setProtected() /** @group setParam */ def setSignatures(value: Map[String, String]): this.type = { diff --git a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala index f7120226c45137..2c915e040ae577 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala @@ -33,7 +33,7 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag]( extends Serializable { model.features.append(this) - 
private val spark = ResourceHelper.spark + private val spark: SparkSession = ResourceHelper.spark val serializationMode: String = ConfigLoader.getConfigStringValue(ConfigHelper.serializationMode) @@ -44,6 +44,7 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag]( final protected var fallbackRawValue: Option[TComplete] = None final protected var fallbackLazyValue: Option[() => TComplete] = None + final protected var isProtected: Boolean = false final def serialize( spark: SparkSession, @@ -117,6 +118,13 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag]( } final def setValue(value: Option[Any]): HasFeatures = { + if (isProtected && isSet) + throw new IllegalArgumentException( + "Trying to set a protected parameter, which was already set." + + " The parameter you are trying to set is protected and can only be set once." + + " For a pretrained model, this was done during the initialization process." + + " If you are trying to train your own model, please check the documentation.") + if (useBroadcast) { if (isSet) broadcastValue.get.destroy() broadcastValue = @@ -136,6 +144,16 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag]( broadcastValue.isDefined || rawValue.isDefined } + /** Sets this feature to be protected and only settable once. + * + * @return + * This Feature + */ + final def setProtected(): this.type = { + isProtected = true + this + } + } class StructFeature[TValue: ClassTag](model: HasFeatures, override val name: String) diff --git a/src/test/scala/com/johnsnowlabs/nlp/HasFeaturesTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/HasFeaturesTestSpec.scala new file mode 100644 index 00000000000000..6a154bf9a350a6 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/HasFeaturesTestSpec.scala @@ -0,0 +1,29 @@ +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.serialization.StructFeature +import com.johnsnowlabs.tags.FastTest +import org.scalatest.flatspec.AnyFlatSpec + +class HasFeaturesTestSpec extends AnyFlatSpec { + class MockModel extends HasFeatures { + private val protectedMockFeature = + new StructFeature[String](this, "mockFeature").setProtected() + def setProtectedMockFeature(value: String): this.type = set(protectedMockFeature, value) + def getProtectedMockFeature: String = $$(protectedMockFeature) + + } + + val model = new MockModel + + behavior of "HasFeaturesModels" + + it should "set protected params only once" taggedAs FastTest in { + model.setProtectedMockFeature("first") + assert(model.getProtectedMockFeature == "first") + + assertThrows[IllegalArgumentException] { + model.setProtectedMockFeature("second") + } + } + +} From ab50ae68b6c5278bb7cf5a9f9d25af00227cafc7 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Thu, 4 May 2023 12:37:21 +0000 Subject: [PATCH 08/32] Added code to support caching in the bart model Removed noBeamsearch and related methods --- .../scala/com/johnsnowlabs/ml/ai/Bart.scala | 601 +++--------------- 1 file changed, 104 insertions(+), 497 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala index 680cfc0bf5b62d..37ec506d8c7cde 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala @@ -55,7 +55,8 @@ private[johnsnowlabs] class Bart( configProtoBytes: Option[Array[Byte]] = None, signatures: Option[Map[String, String]] = None, merges: Map[(String, String), Int], - vocabulary: Map[String, Int]) + vocabulary: 
Map[String, Int], + useCache: Boolean = false) extends Serializable with Generate { @@ -69,6 +70,30 @@ private[johnsnowlabs] class Bart( private val paddingTokenId = 1 private val eosTokenId = 2 private val vocab_size = 50264 + private val encoderInputIdsKey = "encoder_encoder_input_ids:0" + private val encoderAttentionMaskKey = "encoder_encoder_attention_mask:0" + private val encoderOutputKey = "StatefulPartitionedCall_2:0" + + private val decoderInitInputIdsKey = "decoder_init_decoder_input_ids:0" + private val decoderInitEncoderAttentionMaskKey = "decoder_init_encoder_attention_mask:0" + private val decoderInitEncoderStateKey = "decoder_init_encoder_state:0" + + private val decoderInitOutputLogitsKey = "StatefulPartitionedCall_1:2" + private val decoderInitOutputCache1Key = "StatefulPartitionedCall_1:0" + private val decoderInitOutputCache2Key = "StatefulPartitionedCall_1:1" + + private val decoderCachedInputIdsKey = "decoder_cached_decoder_input_ids:0" + private val decoderCachedEncoderAttentionMaskKey = "decoder_cached_encoder_attention_mask:0" + private val decoderCachedEncoderStateKey = "decoder_cached_encoder_state:0" + private val decoderCachedCache1Key = "decoder_cached_cache1:0" + private val decoderCachedCache2Key = "decoder_cached_cache2:0" + + private val decoderCachedOutputLogitsKey = "StatefulPartitionedCall:2" + private val decoderCachedOutputCache1Key = "StatefulPartitionedCall:0" + private val decoderCachedOutputCache2Key = "StatefulPartitionedCall:1" + private var nextStateTensor1: Option[org.tensorflow.Tensor] = None + private var nextStateTensor2: Option[org.tensorflow.Tensor] = None + var tensorDecoder = new TensorResources() private def sessionWarmup(): Unit = { val dummyInput = Array.fill(1)(0) ++ Array(eosTokenId) @@ -151,20 +176,12 @@ private[johnsnowlabs] class Bart( val encoderAttentionMaskTensors = tensorEncoder.createIntBufferTensor(shape, encoderAttentionMaskBuffers) - val runner = session.runner + val runner = session.runner; + runner - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.EncoderInputIds.key, - "missing_encoder_input_ids"), - encoderInputTensors) - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.EncoderAttentionMask.key, - "missing_encoder_attention_mask"), - encoderAttentionMaskTensors) - .fetch(_tfBartSignatures - .getOrElse(ModelSignatureConstants.EncoderOutput.key, "missing_last_hidden_state")) + .feed(encoderInputIdsKey, encoderInputTensors) + .feed(encoderAttentionMaskKey, encoderAttentionMaskTensors) + .fetch(encoderOutputKey) val encoderOuts = runner.run().asScala val encoderOutsFloats = TensorResources.extractFloats(encoderOuts.head) @@ -191,22 +208,6 @@ private[johnsnowlabs] class Bart( Array(expandedEncoderInputIdsVals.length, maxSentenceLength, dim), decoderEncoderStateBuffers) -// val modelOutputs = generateNoBeamSearch( -// batch, -// decoderEncoderStateTensors, -// encoderAttentionMaskTensors, -// maxOutputLength, -// minOutputLength, -// doSample, -// temperature, -// topK, -// topP, -// repetitionPenalty, -// noRepeatNgramSize, -// randomSeed, -// ignoreTokenIds, -// session) - val modelOutputs = generateBeamSearch( batch, decoderEncoderStateTensors, @@ -231,8 +232,18 @@ private[johnsnowlabs] class Bart( decoderEncoderStateTensors.close() encoderAttentionMaskTensors.close() encoderInputTensors.close() + if (useCache) { + tensorDecoder.clearTensors() + nextStateTensor1 match { + case Some(t) => t.close() + case None => + } + nextStateTensor2 match { + case Some(t) => t.close() + case None => + } + 
} modelOutputs - } def generateBeamSearch( @@ -296,416 +307,6 @@ private[johnsnowlabs] class Bart( session) } - def generateNoBeamSearch( - inputIds: Seq[Array[Int]], - decoderEncoderStateTensors: Tensor, - encoderAttentionMaskTensors: Tensor, - maxOutputLength: Int, - minOutputLength: Int, - doSample: Boolean, - temperature: Double, - topK: Int, - topP: Double, - repetitionPenalty: Double, - noRepeatNgramSize: Int, - randomSeed: Option[Long], - ignoreTokenIds: Array[Int] = Array(), - session: Session): Array[Array[Int]] = { - - /** Generate sequences for each example without beam search (numBeams == 1). All returned - * sequence are generated independently. - */ - - /* Actual size of each sentence to skip padding in the TF model */ - val sequencesLength = inputIds.map(x => x.length).toArray - val maxSentenceLength = sequencesLength.max // - curLen - - val numReturn_sequences = 1 - // from config - var effectiveBatch_size = 1 - var effectiveBatch_mult = 1 - - // set effective batch size and effective batch multiplier according to do_sample - if (doSample) { - effectiveBatch_size = inputIds.length * numReturn_sequences - effectiveBatch_mult = numReturn_sequences - } else { - effectiveBatch_size = inputIds.length - effectiveBatch_mult = 1 - } - - var decoderInputs = inputIds.map(_ => Array(this.eosTokenId)).toArray - - val batch_size = effectiveBatch_size - var curLen = decoderInputs(0).length - - var stopDecoder = false - - // length of generated sentences / unfinished sentences - var unfinishedSents = List.fill(decoderInputs.length)(1) - var sentLengths = List.fill(decoderInputs.length)(maxOutputLength) - - while (!stopDecoder) { - val decoderInputLength = decoderInputs.head.length - val tensorDecoder = new TensorResources() - - val decoderInputBuffers = - tensorDecoder.createIntBuffer(decoderInputs.length * decoderInputLength) - val decoderAttentionBuffers = - tensorDecoder.createIntBuffer(decoderInputs.length * decoderInputLength) - - decoderInputs.zipWithIndex.foreach { case (pieceIds, idx) => - val offset = idx * decoderInputLength - decoderInputBuffers.offset(offset).write(pieceIds) - val paddingMasks = pieceIds.map(_ => 1) - decoderAttentionBuffers.offset(offset).write(paddingMasks) - } - - val decoderInputTensors = tensorDecoder.createIntBufferTensor( - Array(decoderInputs.length, decoderInputLength), - decoderInputBuffers) - val decoderAttentionMaskTensors = tensorDecoder.createIntBufferTensor( - Array(decoderInputs.length, decoderInputLength), - decoderAttentionBuffers) - - val runner = session.runner - - // TODO add past to the model and use cache - runner - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.DecoderInputIds.key, - "missing_decoder_input_ids"), - decoderInputTensors) - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.DecoderAttentionMask.key, - "missing_encoder_attention_mask"), - decoderAttentionMaskTensors) - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.DecoderEncoderInputIds.key, - "missing_encoder_state"), - decoderEncoderStateTensors) - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.DecoderEncoderAttentionMask.key, - "missing_decoder_encoder_attention_mask"), - encoderAttentionMaskTensors) - .fetch(_tfBartSignatures - .getOrElse(ModelSignatureConstants.DecoderOutput.key, "missing_output_0")) - - val decoderOuts = runner.run().asScala - val decoderOutputs = TensorResources - .extractFloats(decoderOuts.head) - .grouped(vocab_size) - .toArray - .grouped(decoderInputLength) - .toArray - var 
nextTokenLogits = for (decoderOutput <- decoderOutputs) yield decoderOutput.last - - nextTokenLogits = nextTokenLogits.map(logits => { - logits.indices - .map(i => { - if (ignoreTokenIds.contains(i)) Float.MinValue else logits(i) - }) - .toArray - }) - - // repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) - if (repetitionPenalty != 1.0) { - nextTokenLogits = - createNextTokenLogitsPenalties(decoderInputs, nextTokenLogits, repetitionPenalty) - } - - if (noRepeatNgramSize > 0) { - // calculate a list of banned tokens to prevent repetitively generating the same ngrams - // from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - val bannedTokens = - calcBannedNgramTokens(decoderInputs, batch_size, noRepeatNgramSize, curLen) - // create bannedTokens boolean mask - var bannedTokensIndicesMask = Array.empty[IndexedSeq[Boolean]] - for (bannedTokensSlice <- bannedTokens) { - bannedTokensIndicesMask = bannedTokensIndicesMask :+ - (for (token <- 0 until vocab_size) - yield if (bannedTokensSlice.contains(token)) true else false) - } - if (!bannedTokensIndicesMask.isEmpty) - nextTokenLogits = - for ((nextTokenLogit, bannedTokensIndexMask) <- nextTokenLogits.zip( - bannedTokensIndicesMask)) - yield setTensorByIndicesToValue( - nextTokenLogit, - bannedTokensIndexMask, - Float.NegativeInfinity) - } - - // set eos token prob to zero if minLength is not reached - if (!eosTokenId.isNaN && curLen < minOutputLength) { - // create eosTokenId boolean mask - val isTokenLogit_eosToken = - for (token <- 0 until vocab_size) - yield if (token == eosTokenId) true else false - - val eosTokenIndices_mask = Array.fill(batch_size)(isTokenLogit_eosToken) - - nextTokenLogits = - for ((nextTokenLogit, bannedTokensIndex_mask) <- nextTokenLogits.zip( - eosTokenIndices_mask)) - yield setTensorByIndicesToValue( - nextTokenLogit, - bannedTokensIndex_mask, - Float.NegativeInfinity) - } - - var nextToken = Array.ofDim[Int](decoderInputs.length) - - if (doSample) { - // Temperature (higher temperature => more likely to sample low probability tokens) - if (temperature != 1.0) - nextTokenLogits = - for (nextTokenLogit <- nextTokenLogits) - yield nextTokenLogit.map(_ / temperature.toFloat) - // Top-p/top-k filtering - nextTokenLogits = topKTopPFiltering(nextTokenLogits, topK, topP) - // Sample - nextToken = nextTokenLogits.map(input => categoricalSample(input, randomSeed)) - } else { - // Greedy decoding - nextToken = nextTokenLogits.map(input => input.indexOf(input.max)) - } - var tokensToAdd = Array.ofDim[Int](decoderInputs.length) - - // update generations and finished sentences - if (!eosTokenId.isNaN) - // pad finished sentences if eos_token_id exist - tokensToAdd = - nextToken.zip(unfinishedSents).map(x => x._1 * x._2 + paddingTokenId * (1 - x._2)) - else - tokensToAdd = nextToken - - decoderInputs = decoderInputs - .zip(tokensToAdd) - .map(x => { - x._1 ++ Array(x._2) - }) - decoderOuts.foreach(_.close()) - - curLen += 1 - - if (!eosTokenId.isNaN) { - val eosInSents = tokensToAdd.zipWithIndex.map { case (x, ind) => - if (x == eosTokenId && ind != 0) 1 else 0 - } - // if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length - val isSentsUnfinishedAndTokenToAddIsEos = - unfinishedSents.zip(eosInSents).map(x => x._1 * x._2) - - sentLengths = sentLengths - .zip(isSentsUnfinishedAndTokenToAddIsEos) - .map(x => x._1 * (1 - x._2) + curLen * x._2) - - // unfinishedSents is set to zero if eos in 
sentence - unfinishedSents = - unfinishedSents.zip(isSentsUnfinishedAndTokenToAddIsEos).map(x => x._1 - x._2) - } - - tensorDecoder.clearTensors() - tensorDecoder.clearSession(decoderOuts) - decoderInputTensors.close() - - // stop when there is a eos in each sentence, or if we exceed the maximum length - // stopDecoder = curLen < maxOutputLength || unfinishedSents.max == 0 - stopDecoder = (!decoderInputs.exists(o => !(o.count(_ == this.eosTokenId) == 2)) - || (decoderInputs.head.length > maxOutputLength)) - - } -// tensorEncoder.clearTensors() - decoderInputs - } - - def createNextTokenLogitsPenalties( - inputIds: Seq[Array[Int]], - logits: Array[Array[Float]], - repetitionPenalty: Double): Array[Array[Float]] = { - // create logit penalties for already seen inputIds - val nextTokenLogits = Array.ofDim[Array[Float]](logits.length) - - for (i <- logits.indices) { - var nextTokenLogit = logits(i) - val prevInputIds = inputIds.head.distinct - for ((prevInputId, _) <- prevInputIds.zipWithIndex) { - var logitPenalty = 1.0 - if (logits(i)(prevInputId.toInt) < 0) { - logitPenalty = repetitionPenalty - } else { - logitPenalty = 1 / repetitionPenalty - } - nextTokenLogit = nextTokenLogit.updated( - prevInputId.toInt, - (logitPenalty * nextTokenLogit(prevInputId.toInt)).toFloat) - } - nextTokenLogits(i) = nextTokenLogit - } - nextTokenLogits - } - - private def calcBannedNgramTokens( - prevInputIds: Seq[Array[Int]], - numHypos: Int, - noRepeatNgramSize: Int, - curLen: Int): Array[Array[Int]] = { - // based on fairseq for noRepeatNgram in beam_search - if (curLen + 1 < noRepeatNgramSize) - // return no banned tokens if we haven't generated noRepeatNgram_size tokens yet - return Array.ofDim[Int](numHypos, 0) - val generatedNgrams = - Array.tabulate(numHypos)(_ => mutable.Map.empty[IndexedSeq[Int], List[Int]]) - for (idx <- 0 until numHypos) { - val genTokens = prevInputIds(idx) - val generatedNgram = generatedNgrams(idx) - val ngramArrays = for (e <- 0 until noRepeatNgramSize) yield genTokens.drop(e) - for (ngramInd <- ngramArrays.last.indices) { - val ngram = for (e <- ngramArrays) yield e(ngramInd) - val prevNgramTuple = ngram.dropRight(1) - generatedNgram(prevNgramTuple) = - generatedNgram.getOrElse(prevNgramTuple, List.empty[Int]) :+ ngram.last - } - } - (for (hypoIdx <- 0 until numHypos) - yield getGeneratedNgrams( - prevInputIds, - generatedNgrams, - hypoIdx, - curLen, - noRepeatNgramSize)).toArray - } - - def getGeneratedNgrams( - prevInputIds: Seq[Array[Int]], - generatedNgrams: Array[mutable.Map[IndexedSeq[Int], List[Int]]], - hypoIdx: Int, - curLen: Int, - noRepeatNgramSize: Int): Array[Int] = { - // Before decoding the next token, prevent decoding of ngrams that have already appeared - val startIdx = curLen + 1 - noRepeatNgramSize - val ngramIdx = prevInputIds(hypoIdx).slice(startIdx, curLen) - generatedNgrams(hypoIdx).getOrElse(ngramIdx, List.empty[Int]).toArray - } - - private def topKTopPFiltering( - logits: Array[Array[Float]], - topK: Int, - topP: Double, - filterValue: Float = Float.NegativeInfinity, - minTokensToKeep: Int = 1): Array[Array[Float]] = { - - /** Filter a distribution of logits using top-k and/or nucleus (top-p) filtering * Args: - * logits: logits distribution shape (batch size, vocabulary size) if topK > 0: keep only top - * k tokens with highest probability (top-k filtering). if topP < 1.0: keep the top tokens - * with cumulative probability >= topP (nucleus filtering). Nucleus filtering is described in - * Holtzman et al. 
(http://arxiv.org/abs/1904.09751) Make sure we keep at least - * minTokensToKeep per batch example in the output From: - * https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - */ - var logitsUpd = logits - val logitsShape = Array(logits.length, logits(0).length) - - if (topK > 0) { - val topKup = topK.max(minTokensToKeep).min(logitsShape.last) // Safety check - - /** Remove all tokens with a probability less than the last token of the top-k */ - val removeLimit = logits(0).sortWith(_ > _).take(topKup).min - val indicesToRemove = - for (logit <- logits) - yield for (elem <- logit) yield if (elem < removeLimit) true else false - - logitsUpd = - for ((nextTokenLogit, indexToRemove) <- logits.zip(indicesToRemove)) - yield setTensorByIndicesToValue(nextTokenLogit, indexToRemove, Float.NegativeInfinity) - } - if (topP < 1.0) { - val (sortedLogits, sortedIndices) = logits(0).zipWithIndex.sorted.reverse.unzip - - val cumulativeProbs = scanLeft(softmax(sortedLogits))(0.0)(_ + _).drop(1) - - /** Remove tokens with cumulative probability above the threshold (token with 0 are kept) */ - var sortedIndicesToRemove = - for (prob <- cumulativeProbs) - yield if (prob > topP) true else false - - if (minTokensToKeep > 1) { - - /** Keep at least minTokensToKeep (set to minTokensToKeep-1 because we add the first one - * below) - */ - sortedIndicesToRemove = List.fill(sortedIndicesToRemove.take(minTokensToKeep).length)( - false) ++ sortedIndicesToRemove.drop(minTokensToKeep) - } - - /** Shift the indices to the right to keep also the first token above the threshold */ - sortedIndicesToRemove = sortedIndicesToRemove.takeRight(1) ++ sortedIndicesToRemove - .dropRight(1) - sortedIndicesToRemove = - List.fill(sortedIndicesToRemove.take(1).length)(false) ++ sortedIndicesToRemove - .drop(1) - - /** scatter sorted tensors to original indexing */ - val indicesToRemove = scatterValuesOnBatchIndices(sortedIndicesToRemove, sortedIndices) - logitsUpd = - for ((nextTokenLogit, indexToRemove) <- logits.zip( - IndexedSeq.fill(logits.length)(indicesToRemove))) - yield setTensorByIndicesToValue( - nextTokenLogit, - indexToRemove.toIndexedSeq, - Float.NegativeInfinity) - } - logitsUpd - } - - private def scanLeft[a, b](xs: Iterable[a])(s: b)(f: (b, a) => b) = - xs.foldLeft(List(s))((acc, x) => f(acc.head, x) :: acc).reverse - - private def scatterValuesOnBatchIndices( - values: List[Boolean], - batchIndices: Array[Int]): List[Boolean] = { - // scatter values to pair indices - val (_, initArray) = batchIndices.zip(values).sorted.unzip - initArray.toList - } - - private def setTensorByIndicesToValue( - prevInputIds: Array[Float], - indices: IndexedSeq[Boolean], - value: Float): Array[Float] = { - for ((inputId, index) <- prevInputIds.zip(indices)) yield if (index) value else inputId - } - - private def categoricalSample(dist: Array[Float], randomSeed: Option[Long]): Int = { - val (distFiltered, indices) = - dist.zipWithIndex.filter { case (elem, index) => !elem.isInfinite }.sorted.unzip - - if (distFiltered.length == 1) - return indices(0) - - val normalized = softmax(distFiltered) - - var randomDouble = 0.0 - if (randomSeed.isDefined) - randomDouble = new scala.util.Random(randomSeed.get).nextDouble() - else - randomDouble = scala.util.Random.nextDouble() - - var accum = 0.0 - for ((itemProb, i) <- normalized.zip(indices)) { - accum += itemProb - if (accum >= randomDouble) { - return i - } - } - indices(0) - } - def decode(sentences: Array[Array[Int]]): Seq[String] = { sentences.map(s => 
bpeTokenizer.decodeTokens(s.map(_.toInt))) } @@ -716,12 +317,6 @@ private[johnsnowlabs] class Bart( .map(s => { val sentWithTask = if (task.nonEmpty) s -// new Sentence( -// content = task.concat(" ").concat(s.content), -// start = s.start, -// end = s.end + task.length + 1, -// index = s.index, -// metadata = s.metadata) else s bpeTokenizer .tokenize(sentWithTask) @@ -787,73 +382,85 @@ private[johnsnowlabs] class Bart( encoderAttentionMaskTensors: Tensor, maxLength: Int, session: Session): Array[Array[Float]] = { + val sequencesLength = encoderInputIds.map(x => x.length).toArray var maxSentenceLength = sequencesLength.max // - curLen maxSentenceLength = Math.max(maxSentenceLength, maxLength) - val vocab_size = this.vocab_size - + val vocabSize = this.vocab_size val decoderInputLength = decoderInputIds.head.length - val tensorDecoder = new TensorResources() + val batchSize = encoderInputIds.length + val useLastIdOnly = useCache && (decoderInputLength > 0) + val sequenceLength = if (useLastIdOnly) 1 else decoderInputLength + if (!useCache) { + tensorDecoder = new TensorResources() + } val decoderInputBuffers = - tensorDecoder.createIntBuffer(decoderInputIds.length * decoderInputLength) - val decoderAttentionBuffers = - tensorDecoder.createIntBuffer(decoderInputIds.length * decoderInputLength) + tensorDecoder.createIntBuffer(decoderInputIds.length * sequenceLength) decoderInputIds.zipWithIndex.foreach { case (pieceIds, idx) => - val offset = idx * decoderInputLength - decoderInputBuffers.offset(offset).write(pieceIds) - val paddingMasks = pieceIds.map(_ => 1) - decoderAttentionBuffers.offset(offset).write(paddingMasks) + val offset = idx * sequenceLength + decoderInputBuffers + .offset(offset) + .write(if (useLastIdOnly) pieceIds.takeRight(1) else pieceIds) } val decoderInputTensors = tensorDecoder.createIntBufferTensor( - Array(decoderInputIds.length, decoderInputLength), + Array(decoderInputIds.length, sequenceLength), decoderInputBuffers) - val decoderAttentionMaskTensors = tensorDecoder.createIntBufferTensor( - Array(decoderInputIds.length, decoderInputLength), - decoderAttentionBuffers) - val runner = session.runner + val runner = if (nextStateTensor1.isEmpty || nextStateTensor2.isEmpty) { + val r = session.runner + .feed(decoderInitInputIdsKey, decoderInputTensors) + .feed(decoderInitEncoderStateKey, decoderEncoderStateTensors) + .feed(decoderInitEncoderAttentionMaskKey, encoderAttentionMaskTensors) + .fetch(decoderInitOutputLogitsKey) - // TODO add past to the model and use cache - runner - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.DecoderInputIds.key, - "missing_decoder_input_ids"), - decoderInputTensors) - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.DecoderAttentionMask.key, - "missing_encoder_attention_mask"), - decoderAttentionMaskTensors) - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.DecoderEncoderInputIds.key, - "missing_encoder_state"), - decoderEncoderStateTensors) - .feed( - _tfBartSignatures.getOrElse( - ModelSignatureConstants.DecoderEncoderAttentionMask.key, - "missing_decoder_encoder_attention_mask"), - encoderAttentionMaskTensors) - .fetch(_tfBartSignatures - .getOrElse(ModelSignatureConstants.DecoderOutput.key, "missing_output_0")) + if (!useCache) + r + else + r + .fetch(decoderInitOutputCache1Key) + .fetch(decoderInitOutputCache2Key) + } else { + session.runner + .feed(decoderCachedInputIdsKey, decoderInputTensors) + .feed(decoderCachedEncoderStateKey, decoderEncoderStateTensors) + 
.feed(decoderCachedEncoderAttentionMaskKey, encoderAttentionMaskTensors) + .feed(decoderCachedCache1Key, nextStateTensor1.get) + .feed(decoderCachedCache2Key, nextStateTensor2.get) + .fetch(decoderCachedOutputLogitsKey) + .fetch(decoderCachedOutputCache1Key) + .fetch(decoderCachedOutputCache2Key) + } val decoderOuts = runner.run().asScala - val decoderOutputs = TensorResources - .extractFloats(decoderOuts.head) - .grouped(vocab_size) - .toArray - .grouped(decoderInputLength) - .toArray - val nextTokenLogits = for (decoderOutput <- decoderOutputs) yield decoderOutput.last - decoderOuts.foreach(_.close()) - tensorDecoder.clearTensors() - tensorDecoder.clearSession(decoderOuts) + val logitsRaw = TensorResources.extractFloats(decoderOuts.head) + decoderOuts.head.close() + val decoderOutputs = (0 until batchSize).map(i => { + logitsRaw + .slice( + i * sequenceLength * vocabSize + (sequenceLength - 1) * vocabSize, + i * sequenceLength * vocabSize + sequenceLength * vocabSize) + }) + + if (useCache) { + if (nextStateTensor1.isDefined) { + nextStateTensor1.get.close() + } + if (nextStateTensor2.isDefined) { + nextStateTensor2.get.close() + } + nextStateTensor1 = Some(decoderOuts(1).asRawTensor()) + nextStateTensor2 = Some(decoderOuts(2).asRawTensor()) + } + + val nextTokenLogits = decoderOutputs.toArray + if (!useCache) { + tensorDecoder.clearSession(decoderOuts) + tensorDecoder.clearTensors() + } decoderInputTensors.close() - decoderAttentionMaskTensors.close() nextTokenLogits } } From 27c6319718e4bec0975e3613153bd3a36cd48544 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Thu, 4 May 2023 12:37:45 +0000 Subject: [PATCH 09/32] Added code to support caching in the BartTransformer --- .../annotators/seq2seq/BartTransformer.scala | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala index 4afd0796a99d08..7a906346680697 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTransformer.scala @@ -427,10 +427,27 @@ class BartTransformer(override val uid: String) /** @group setParam */ def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) + /** Cache internal state of the model to improve performance + * + * @group param + */ + val useCache = + new BooleanParam(parent = this, name = "useCache", doc = "Cache internal state of the model") + + protected def setUseCache(value: Boolean): BartTransformer.this.type = { + set(useCache, value) + this + } + + def getUseCache: Boolean = $(useCache) + /** @group setParam */ - def setModelIfNotSet(spark: SparkSession, tfWrapper: TensorflowWrapper): this.type = { + def setModelIfNotSet( + spark: SparkSession, + tfWrapper: TensorflowWrapper, + useCache: Boolean): this.type = { if (_tfModel.isEmpty) { - + setUseCache(useCache) _tfModel = Some( spark.sparkContext.broadcast( new Bart( @@ -438,7 +455,8 @@ class BartTransformer(override val uid: String) configProtoBytes = getConfigProtoBytes, signatures = getSignatures, $$(merges), - $$(vocabulary)))) + $$(vocabulary), + useCache = useCache))) } this } @@ -458,7 +476,8 @@ class BartTransformer(override val uid: String) noRepeatNgramSize -> 0, ignoreTokenIds -> Array(), batchSize -> 1, - beamSize -> 4) + beamSize -> 4, + useCache -> true) override def batchAnnotate(batchedAnnotations: 
Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { @@ -548,12 +567,15 @@ trait ReadBartTransformerDLModel extends ReadTensorflowModel { "_bart_tf", savedSignatures = instance.getSignatures, initAllTables = false) - instance.setModelIfNotSet(spark, tf) + instance.setModelIfNotSet(spark, tf, instance.getUseCache) } addReader(readModel) - def loadSavedModel(modelPath: String, spark: SparkSession): BartTransformer = { + def loadSavedModel( + modelPath: String, + spark: SparkSession, + useCache: Boolean = true): BartTransformer = { val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) @@ -592,7 +614,7 @@ trait ReadBartTransformerDLModel extends ReadTensorflowModel { */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper) + .setModelIfNotSet(spark, wrapper, useCache) case _ => throw new Exception(notSupportedEngineError) From 2448292d0136dbaa7d14fa7c76ca8c2f9f23bf52 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Thu, 4 May 2023 12:37:56 +0000 Subject: [PATCH 10/32] Added python API to support caching in the BartTransformer --- .../annotator/seq2seq/bart_transformer.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/python/sparknlp/annotator/seq2seq/bart_transformer.py b/python/sparknlp/annotator/seq2seq/bart_transformer.py index 5c04a372919756..e3c2b0fc80f872 100755 --- a/python/sparknlp/annotator/seq2seq/bart_transformer.py +++ b/python/sparknlp/annotator/seq2seq/bart_transformer.py @@ -95,7 +95,8 @@ class BartTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): If set to int > 0, all ngrams of that size can only occur once, by default `0`. ignoreTokenIds A list of token ids which are ignored in the decoder's output, by default `[]`. - + useCache + Whether or not to use cache, by default `False`. Notes ----- This is a very computationally expensive module especially on larger @@ -209,6 +210,8 @@ class BartTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): "The Number of beams for beam search.", typeConverter=TypeConverters.toInt) + useCache = Param(Params._dummy(), "useCache", "Use caching to enhance performance", typeConverter=TypeConverters.toBoolean) + def setIgnoreTokenIds(self, value): """A list of token ids which are ignored in the decoder's output, by default `[]`. @@ -340,6 +343,16 @@ def setBeamSize(self, value): """ return self._set(beamSize=value) + def setCache(self, value): + """Sets whether or not to use caching to enhance performance, by default `False`. + + Parameters + ---------- + value : bool + Whether or not to use caching to enhance performance + """ + return self._set(useCache=value) + @keyword_only def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.BartTransformer", java_model=None): super(BartTransformer, self).__init__( @@ -358,11 +371,12 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.BartTransf noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1, - beamSize=4 + beamSize=4, + useCache=False, ) @staticmethod - def loadSavedModel(folder, spark_session): + def loadSavedModel(folder, spark_session, use_cache): """Loads a locally saved model. 
Parameters @@ -371,6 +385,8 @@ def loadSavedModel(folder, spark_session): Folder of the saved model spark_session : pyspark.sql.SparkSession The current SparkSession + use_cache: bool + The model uses caching to facilitate performance Returns ------- @@ -378,7 +394,7 @@ def loadSavedModel(folder, spark_session): The restored model """ from sparknlp.internal import _BartLoader - jModel = _BartLoader(folder, spark_session._jsparkSession)._java_obj + jModel = _BartLoader(folder, spark_session._jsparkSession, use_cache)._java_obj return BartTransformer(java_model=jModel) @staticmethod From 763d96ce972d3f826d72a3c0e8d2b4ef5c2bfa53 Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Thu, 4 May 2023 12:55:09 -0500 Subject: [PATCH 11/32] SPARKNLP-832 Fix MultiDateMatcher bug that didn't return multiple dates --- .../nlp/annotators/DateMatcherUtils.scala | 2 +- .../nlp/annotators/MultiDateMatcher.scala | 11 +++++++++-- .../com/johnsnowlabs/nlp/util/regex/RuleFactory.scala | 6 ++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala index 3f3095f1c51031..392bc45d2475fe 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala @@ -326,7 +326,7 @@ trait DateMatcherUtils extends Params { * first only. any other matches discarded Auto completes short versions of months. Any two * digit year is considered to be XX century */ - protected val relaxedFactory: RuleFactory = new RuleFactory(MatchStrategy.MATCH_FIRST) + protected val relaxedFactory: RuleFactory = new RuleFactory(MatchStrategy.MATCH_ALL) .addRule(relaxedDayNumbered, "relaxed days") .addRule(relaxedMonths.r, "relaxed months exclusive") .addRule(relaxedYear, "relaxed year") diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala index a2c31e0ddb82cb..3925501330700d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala @@ -247,7 +247,14 @@ class MultiDateMatcher(override val uid: String) } private def extractRelaxedDate(text: String): Seq[MatchedDateTime] = { - val possibleDates = relaxedFactory.findMatch(text) + val possibleDates = relaxedFactory.findMatch(text) + val possibleDatesByIndexMatch = possibleDates.groupBy(_.indexMatch) + possibleDatesByIndexMatch.flatMap{ case (_, possibleDates) => + computePossibleDates(possibleDates) + }.toSeq + } + + private def computePossibleDates(possibleDates: Seq[RuleFactory.RuleMatch]): Seq[MatchedDateTime] = { var dayMatch = $(defaultDayWhenMissing) var monthMatch = defaultMonthWhenMissing var yearMatch = defaultYearWhenMissing @@ -256,7 +263,7 @@ class MultiDateMatcher(override val uid: String) possibleDates.foreach(possibleDate => { if (possibleDate.identifier == "relaxed days" && possibleDate.content.matched.exists( - _.isDigit)) { + _.isDigit)) { changes += 1 dayMatch = possibleDate.content.matched.filter(_.isDigit).toInt } diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala index 177f436149065c..33b0c5cd6a34b2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala @@ 
-69,7 +69,9 @@ class RuleFactory( matchStrategy match { case MATCH_ALL => rules.flatMap(rule => - rule.regex.findAllMatchIn(text).map(m => RuleMatch(m, rule.identifier))) + rule.regex.findAllMatchIn(text).zipWithIndex.map{ case (currentMatch, index) => + RuleMatch(currentMatch, rule.identifier, index) + }) case MATCH_FIRST => rules.flatMap(rule => rule.regex.findFirstMatchIn(text).map(m => RuleMatch(m, rule.identifier))) @@ -234,7 +236,7 @@ object RuleFactory { * @param identifier * user provided identification of a rule */ - case class RuleMatch(content: Regex.Match, identifier: String) + case class RuleMatch(content: Regex.Match, identifier: String, indexMatch: Int = -1) } /** Allowed strategies for [[RuleFactory]] applications regarding replacement */ From ace1691a98aa53fafa9f0492b6746c796370b3a6 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Fri, 5 May 2023 01:25:04 +0500 Subject: [PATCH 12/32] added RobertaForZeroShotClsasification --- python/README.md | 2 +- .../annotator/classifier_dl/__init__.py | 3 +- ...berta_bert_for_zero_shot_classification.py | 222 ++++++++ python/sparknlp/internal/__init__.py | 7 + ...berta_for_zero_shot_classification_test.py | 52 ++ .../ml/ai/RoBertaClassification.scala | 80 ++- .../com/johnsnowlabs/nlp/annotator.scala | 6 + .../dl/RoBertaForZeroShotClassification.scala | 475 ++++++++++++++++++ .../nlp/pretrained/ResourceDownloader.scala | 3 +- ...rtaForZeroShotClassificationTestSpec.scala | 186 +++++++ 10 files changed, 1031 insertions(+), 5 deletions(-) create mode 100644 python/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.py create mode 100644 python/test/annotator/classifier_dl/roberta_for_zero_shot_classification_test.py create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassificationTestSpec.scala diff --git a/python/README.md b/python/README.md index d8e5c09c2707e2..2aa2d36e571343 100644 --- a/python/README.md +++ b/python/README.md @@ -614,7 +614,7 @@ from [Maven Central](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp). 
To add JARs to spark programs use the `--jars` option: ```sh -spark-shell --jars spark-nlp.jar +spark-shell --jars sparknlp.jar ``` The preferred way to use the library when running spark programs is using the `--packages` option as specified in diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py index fbf56f8c060ed8..ce4a94069d3145 100644 --- a/python/sparknlp/annotator/classifier_dl/__init__.py +++ b/python/sparknlp/annotator/classifier_dl/__init__.py @@ -44,4 +44,5 @@ from sparknlp.annotator.classifier_dl.camembert_for_sequence_classification import * from sparknlp.annotator.classifier_dl.camembert_for_question_answering import * from sparknlp.annotator.classifier_dl.bert_for_zero_shot_classification import * -from sparknlp.annotator.classifier_dl.distil_bert_for_zero_shot_classification import * \ No newline at end of file +from sparknlp.annotator.classifier_dl.distil_bert_for_zero_shot_classification import * +from sparknlp.annotator.classifier_dl.roberta_bert_for_zero_shot_classification import * \ No newline at end of file diff --git a/python/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.py b/python/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.py new file mode 100644 index 00000000000000..d48d37992ac583 --- /dev/null +++ b/python/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.py @@ -0,0 +1,222 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for RoBertaForZeroShotClassification.""" + +from sparknlp.common import * + + +class RoBertaForZeroShotClassification(AnnotatorModel, + HasCaseSensitiveProperties, + HasBatchedAnnotate, + HasClassifierActivationProperties, + HasCandidateLabelsProperties, + HasEngine): + """RoBertaForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language + inference) tasks. Equivalent of `RoBertaForSequenceClassification` models, but these models don't require a hardcoded + number of potential classes, they can be chosen at runtime. It usually means it's slower but it is much more + flexible. + + Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis + pair and passed to the pretrained model. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> sequenceClassifier = RoBertaForZeroShotClassification.pretrained() \\ + ... .setInputCols(["token", "document"]) \\ + ... .setOutputCol("label") + + The default model is ``"roberta_base_zero_shot_classifier_nli"``, if no name is + provided. + + For available pretrained models please see the `Models Hub + `__. + + To see which models are compatible and how to import them see + `Import Transformers into Spark NLP 🚀 + `_. 
+ + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT, TOKEN`` ``CATEGORY`` + ====================== ====================== + + Parameters + ---------- + batchSize + Batch size. Large values allows faster processing but requires more + memory, by default 8 + caseSensitive + Whether to ignore case in tokens for embeddings matching, by default + True + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. + maxSentenceLength + Max sentence length to process, by default 128 + coalesceSentences + Instead of 1 class per sentence (if inputCols is `sentence`) output 1 + class per document by averaging probabilities in all sentences, by + default False + activation + Whether to calculate logits via Softmax or Sigmoid, by default + `"softmax"`. + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> documentAssembler = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> tokenizer = Tokenizer() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("token") + >>> sequenceClassifier = RoBertaForZeroShotClassification.pretrained() \\ + ... .setInputCols(["token", "document"]) \\ + ... .setOutputCol("label") \\ + ... .setCaseSensitive(True) + >>> pipeline = Pipeline().setStages([ + ... documentAssembler, + ... tokenizer, + ... sequenceClassifier + ... ]) + >>> data = spark.createDataFrame([["I loved this movie when I was a child.", "It was pretty boring."]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.select("label.result").show(truncate=False) + +------+ + |result| + +------+ + |[pos] | + |[neg] | + +------+ + """ + name = "RoBertaForZeroShotClassification" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + + outputAnnotatorType = AnnotatorType.CATEGORY + + maxSentenceLength = Param(Params._dummy(), + "maxSentenceLength", + "Max sentence length to process", + typeConverter=TypeConverters.toInt) + + configProtoBytes = Param(Params._dummy(), + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", + TypeConverters.toListInt) + + coalesceSentences = Param(Params._dummy(), "coalesceSentences", + "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.", + TypeConverters.toBoolean) + + def getClasses(self): + """ + Returns labels used to train this model + """ + return self._call_java("getClasses") + + def setConfigProtoBytes(self, b): + """Sets configProto from tensorflow, serialized into byte array. + + Parameters + ---------- + b : List[int] + ConfigProto from tensorflow, serialized into byte array + """ + return self._set(configProtoBytes=b) + + def setMaxSentenceLength(self, value): + """Sets max sentence length to process, by default 128. + + Parameters + ---------- + value : int + Max sentence length to process + """ + return self._set(maxSentenceLength=value) + + def setCoalesceSentences(self, value): + """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging + probabilities in all sentences. 
Due to max sequence length limit in almost all transformer models such as RoBerta + (512 tokens), this parameter helps to feed all the sentences into the model and averaging all the probabilities + for the entire document instead of probabilities per sentence. (Default: true) + + Parameters + ---------- + value : bool + If the output of all sentences will be averaged to one output + """ + return self._set(coalesceSentences=value) + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForZeroShotClassification", + java_model=None): + super(RoBertaForZeroShotClassification, self).__init__( + classname=classname, + java_model=java_model + ) + self._setDefault( + batchSize=8, + maxSentenceLength=128, + caseSensitive=True, + coalesceSentences=False, + activation="softmax" + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + RoBertaForZeroShotClassification + The restored model + """ + from sparknlp.internal import _RoBertaForZeroShotClassification + jModel = _RoBertaForZeroShotClassification(folder, spark_session._jsparkSession)._java_obj + return RoBertaForZeroShotClassification(java_model=jModel) + + @staticmethod + def pretrained(name="roberta_base_zero_shot_classifier_nli", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default + "roberta_base_zero_shot_classifier_nli" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. + + Returns + ------- + RoBertaForZeroShotClassification + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(RoBertaForZeroShotClassification, name, lang, remote_loc) diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index deab0142aa9a6b..544d0b9ce02b27 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -522,3 +522,10 @@ def __init__(self, path, jspark): super(_DistilBertForZeroShotClassification, self).__init__( "com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForZeroShotClassification.loadSavedModel", path, jspark) + + +class _RoBertaForZeroShotClassification(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_RoBertaForZeroShotClassification, self).__init__( + "com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForZeroShotClassification.loadSavedModel", path, + jspark) diff --git a/python/test/annotator/classifier_dl/roberta_for_zero_shot_classification_test.py b/python/test/annotator/classifier_dl/roberta_for_zero_shot_classification_test.py new file mode 100644 index 00000000000000..31667f71560ef5 --- /dev/null +++ b/python/test/annotator/classifier_dl/roberta_for_zero_shot_classification_test.py @@ -0,0 +1,52 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.slow +class RoBertaForZeroShotClassificationTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + self.text = "I have a problem with my iphone that needs to be resolved asap!!" + self.inputDataset = self.spark.createDataFrame([[self.text]]) \ + .toDF("text") + + def runTest(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + zero_shot_classifier = RoBertaForZeroShotClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") \ + .setCandidateLabels(["urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology"]) + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + zero_shot_classifier + ]) + + model = pipeline.fit(self.inputDataset) + model.transform(self.inputDataset).show() + light_pipeline = LightPipeline(model) + annotations_result = light_pipeline.fullAnnotate(self.text) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala index 78034175c20e2e..3e80bedef517b1 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala @@ -20,6 +20,7 @@ import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignat import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BpeTokenizer +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} import com.johnsnowlabs.nlp.{ActivationFunction, Annotation} import org.tensorflow.ndarray.buffer.IntDataBuffer @@ -86,8 +87,19 @@ private[johnsnowlabs] class RoBertaClassification( def tokenizeSeqString( candidateLabels: Seq[String], maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = ??? + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + + val basicTokenizer = new BasicTokenizer(caseSensitive) + val encoder = new WordpieceEncoder(vocabulary, unkToken = "") + val labelsToSentences = candidateLabels.map { s => Sentence(s, 0, s.length - 1, 0) } + + labelsToSentences.map(label => { + val tokens = basicTokenizer.tokenize(label) + val wordpieceTokens = tokens.flatMap(token => encoder.encode(token)).take(maxSeqLength) + WordpieceTokenizedSentence(wordpieceTokens) + }) + } def tokenizeDocument( docs: Seq[Annotation], maxSeqLength: Int, @@ -241,7 +253,71 @@ private[johnsnowlabs] class RoBertaClassification( batch: Seq[Array[Int]], entailmentId: Int, contradictionId: Int, - activation: String): Array[Array[Float]] = ??? 
+ activation: String): Array[Array[Float]] = { + val tensors = new TensorResources() + + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + val batchLength = batch.length + + val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val segmentBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + + // [nb of encoded sentences , maxSentenceLength] + val shape = Array(batch.length.toLong, maxSentenceLength) + + batch.zipWithIndex + .foreach { case (sentence, idx) => + val offset = idx * maxSentenceLength + tokenBuffers.offset(offset).write(sentence) + maskBuffers.offset(offset).write(sentence.map(x => if (x == 0) 0 else 1)) + val sentenceEndTokenIndex = sentence.indexOf(sentenceEndTokenId) + segmentBuffers + .offset(offset) + .write( + sentence.indices + .map(i => + if (i < sentenceEndTokenIndex) 0 + else if (i == sentenceEndTokenIndex) 1 + else 1) + .toArray) + } + + val session = tensorflowWrapper.getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + val runner = session.runner + + val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) + val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers) + + runner + .feed( + _tfRoBertaSignatures.getOrElse( + ModelSignatureConstants.InputIds.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfRoBertaSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .fetch(_tfRoBertaSignatures + .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) + + val outs = runner.run().asScala + val rawScores = TensorResources.extractFloats(outs.head) + + outs.foreach(_.close()) + tensors.clearSession(outs) + tensors.clearTensors() + + val dim = rawScores.length / batchLength + rawScores + .grouped(dim) + .toArray + } def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { val tensors = new TensorResources() diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index 034e2d58219f15..d41a32502c9220 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -699,4 +699,10 @@ package object annotator { extends ReadablePretrainedDistilBertForZeroShotModel with ReadDistilBertForZeroShotDLModel + type RobertaBertForZeroShotClassification = + com.johnsnowlabs.nlp.annotators.classifier.dl.RoBertaForZeroShotClassification + + object RoBertaForZeroShotClassification + extends ReadablePretrainedRoBertaForZeroShotModel + with ReadRoBertaForZeroShotDLModel } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala new file mode 100644 index 00000000000000..21494a964a6708 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.scala @@ -0,0 +1,475 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.ml.ai.RoBertaClassification +import com.johnsnowlabs.ml.tensorflow._ +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadTextAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.ModelEngine +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.serialization.MapFeature +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param.{BooleanParam, IntArrayParam, IntParam} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +/** RoBertaForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI + * (natural language inference) tasks. Equivalent of `RoBertaForZeroShotClassification ` models, + * but these models don't require a hardcoded number of potential classes, they can be chosen at + * runtime. It usually means it's slower but it is much more flexible. + * + * Any combination of sequences and labels can be passed and each combination will be posed as a + * premise/hypothesis pair and passed to the pretrained model. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val sequenceClassifier = RoBertaForZeroShotClassification .pretrained() + * .setInputCols("token", "document") + * .setOutputCol("label") + * }}} + * The default model is `"roberta_base_zero_shot_classifier_nli"`, if no name is provided. + * + * For available pretrained models please see the + * [[https://sparknlp.org/models?task=Text+Classification Models Hub]]. + * + * To see which models are compatible and how to import them see + * [[https://github.com/JohnSnowLabs/spark-nlp/discussions/5669]] and to see more extended + * examples, see + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification .scala RoBertaForZeroShotClassification]]. 
+ * + * ==Example== + * {{{ + * import spark.implicits._ + * import com.johnsnowlabs.nlp.base._ + * import com.johnsnowlabs.nlp.annotator._ + * import org.apache.spark.ml.Pipeline + * + * val documentAssembler = new DocumentAssembler() + * .setInputCol("text") + * .setOutputCol("document") + * + * val tokenizer = new Tokenizer() + * .setInputCols("document") + * .setOutputCol("token") + * + * val sequenceClassifier = RoBertaForZeroShotClassification .pretrained() + * .setInputCols("token", "document") + * .setOutputCol("label") + * .setCaseSensitive(true) + * + * val pipeline = new Pipeline().setStages(Array( + * documentAssembler, + * tokenizer, + * sequenceClassifier + * )) + * + * val data = Seq("I loved this movie when I was a child.", "It was pretty boring.").toDF("text") + * val result = pipeline.fit(data).transform(data) + * + * result.select("label.result").show(false) + * +------+ + * |result| + * +------+ + * |[pos] | + * |[neg] | + * +------+ + * }}} + * + * @see + * [[RoBertaForZeroShotClassification]] for sequence-level classification + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based classifiers + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class RoBertaForZeroShotClassification(override val uid: String) + extends AnnotatorModel[RoBertaForZeroShotClassification] + with HasBatchedAnnotate[RoBertaForZeroShotClassification] + with WriteTensorflowModel + with HasCaseSensitiveProperties + with HasClassifierActivationProperties + with HasEngine + with HasCandidateLabelsProperties { + + /** Annotator reference id. 
Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("ROBERTA_FOR_ZERO_SHOT_CLASSIFICATION")) + + /** Input Annotator Types: DOCUMENT, TOKEN + * + * @group anno + */ + override val inputAnnotatorTypes: Array[String] = + Array(AnnotatorType.DOCUMENT, AnnotatorType.TOKEN) + + /** Output Annotator Types: CATEGORY + * + * @group anno + */ + override val outputAnnotatorType: AnnotatorType = AnnotatorType.CATEGORY + + /** @group setParam */ + def sentenceStartTokenId: Int = { + $$(vocabulary)("") + } + + /** @group setParam */ + def sentenceEndTokenId: Int = { + $$(vocabulary)("") + } + + def padTokenId: Int = { + $$(vocabulary)("") + } + + /** Vocabulary used to encode the words to ids with WordPieceEncoder + * + * @group param + */ + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary") + + /** @group setParam */ + def setVocabulary(value: Map[String, Int]): this.type = { + if (get(vocabulary).isEmpty) + set(vocabulary, value) + this + } + + /** Labels used to decode predicted IDs back to string tags + * + * @group param + */ + val labels: MapFeature[String, Int] = new MapFeature(this, "labels") + + /** @group setParam */ + def setLabels(value: Map[String, Int]): this.type = { + if (get(labels).isEmpty) + set(labels, value) + this + } + + /** Returns labels used to train this model */ + def getClasses: Array[String] = { + $$(labels).keys.toArray + } + + /** Holding merges.txt coming from RoBERTa model + * + * @group param + */ + val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges") + + /** @group setParam */ + def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value) + + /** Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document + * by averaging probabilities in all sentences (Default: `false`). + * + * Due to max sequence length limit in almost all transformer models such as RoBerta (512 + * tokens), this parameter helps feeding all the sentences into the model and averaging all the + * probabilities for the entire document instead of probabilities per sentence. + * + * @group param + */ + val coalesceSentences = new BooleanParam( + this, + "coalesceSentences", + "If sets to true the output of all sentences will be averaged to one output instead of one output per sentence. Defaults to false.") + + /** @group setParam */ + def setCoalesceSentences(value: Boolean): this.type = set(coalesceSentences, value) + + /** @group getParam */ + def getCoalesceSentences: Boolean = $(coalesceSentences) + + /** ConfigProto from tensorflow, serialized into byte array. Get with + * `config_proto.SerializeToString()` + * + * @group param + */ + val configProtoBytes = new IntArrayParam( + this, + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. 
Get with config_proto.SerializeToString()") + + /** @group setParam */ + def setConfigProtoBytes(bytes: Array[Int]): RoBertaForZeroShotClassification.this.type = + set(this.configProtoBytes, bytes) + + /** @group getParam */ + def getConfigProtoBytes: Option[Array[Byte]] = get(this.configProtoBytes).map(_.map(_.toByte)) + + /** Max sentence length to process (Default: `128`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + require( + value <= 512, + "RoBerta models do not support sequences longer than 512 because of trainable positional embeddings.") + require(value >= 1, "The maxSentenceLength must be at least 1") + set(maxSentenceLength, value) + this + } + + /** @group getParam */ + def getMaxSentenceLength: Int = $(maxSentenceLength) + + /** It contains TF model signatures for the laded saved model + * + * @group param + */ + val signatures = new MapFeature[String, String](model = this, name = "signatures") + + /** @group setParam */ + def setSignatures(value: Map[String, String]): this.type = { + if (get(signatures).isEmpty) + set(signatures, value) + this + } + + /** @group getParam */ + def getSignatures: Option[Map[String, String]] = get(this.signatures) + + private var _model: Option[Broadcast[RoBertaClassification]] = None + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + tensorflowWrapper: TensorflowWrapper): RoBertaForZeroShotClassification = { + if (_model.isEmpty) { + _model = Some( + spark.sparkContext.broadcast( + new RoBertaClassification( + tensorflowWrapper, + sentenceStartTokenId, + sentenceEndTokenId, + padTokenId, + configProtoBytes = getConfigProtoBytes, + tags = $$(labels), + signatures = getSignatures, + $$(merges), + $$(vocabulary)))) + } + + this + } + + /** @group getParam */ + def getModelIfNotSet: RoBertaClassification = _model.get.value + + /** Whether to lowercase tokens or not (Default: `true`). + * + * @group setParam + */ + override def setCaseSensitive(value: Boolean): this.type = { + if (get(caseSensitive).isEmpty) + set(this.caseSensitive, value) + this + } + + setDefault( + batchSize -> 8, + maxSentenceLength -> 128, + caseSensitive -> true, + coalesceSentences -> false) + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return + * any number of annotations processed for every input annotation. 
Not necessary one to one + * relationship + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + batchedAnnotations.map(annotations => { + val sentences = SentenceSplit.unpack(annotations).toArray + val tokenizedSentences = TokenizedWithSentence.unpack(annotations).toArray + + if (tokenizedSentences.nonEmpty) { + getModelIfNotSet.predictSequenceWithZeroShot( + tokenizedSentences, + sentences, + $(candidateLabels), + $(entailmentIdParam), + $(contradictionIdParam), + $(batchSize), + $(maxSentenceLength), + $(caseSensitive), + $(coalesceSentences), + $$(labels), + $(activation)) + + } else { + Seq.empty[Annotation] + } + }) + } + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper, + "_roberta_classification", + RoBertaForZeroShotClassification.tfFile, + configProtoBytes = getConfigProtoBytes) + } + +} + +trait ReadablePretrainedRoBertaForZeroShotModel + extends ParamsAndFeaturesReadable[RoBertaForZeroShotClassification] + with HasPretrained[RoBertaForZeroShotClassification] { + override val defaultModelName: Some[String] = Some("roberta_base_zero_shot_classifier_nli") + + /** Java compliant-overrides */ + override def pretrained(): RoBertaForZeroShotClassification = super.pretrained() + + override def pretrained(name: String): RoBertaForZeroShotClassification = + super.pretrained(name) + + override def pretrained(name: String, lang: String): RoBertaForZeroShotClassification = + super.pretrained(name, lang) + + override def pretrained( + name: String, + lang: String, + remoteLoc: String): RoBertaForZeroShotClassification = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadRoBertaForZeroShotDLModel extends ReadTensorflowModel { + this: ParamsAndFeaturesReadable[RoBertaForZeroShotClassification] => + + override val tfFile: String = "roberta_classification_tensorflow" + + def readModel( + instance: RoBertaForZeroShotClassification, + path: String, + spark: SparkSession): Unit = { + + val tf = + readTensorflowModel(path, spark, "_roberta_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, tf) + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): RoBertaForZeroShotClassification = { + + val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) + + val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap + val labels = loadTextAsset(localModelPath, "labels.txt").zipWithIndex.toMap + val bytePairs = loadTextAsset(localModelPath, "merges.txt") + .map(_.split(" ")) + .filter(w => w.length == 2) + .map { case Array(c1, c2) => (c1, c2) } + .zipWithIndex + .toMap + + val entailmentIds = labels.filter(x => x._1.toLowerCase().startsWith("entail")).values.toArray + val contradictionIds = + labels.filter(x => x._1.toLowerCase().startsWith("contradict")).values.toArray + + require( + entailmentIds.length == 1 && contradictionIds.length == 1, + s"""This annotator supports classifiers trained on NLI datasets. You must have only at least 2 or maximum 3 labels in your dataset: + + example with 3 labels: 'contradict', 'neutral', 'entailment' + example with 2 labels: 'contradict', 'entailment' + + You can modify assets/labels.txt file to match the above format. 
+ + Current labels: ${labels.keys.mkString(", ")} + """) + + val annotatorModel = new RoBertaForZeroShotClassification() + .setVocabulary(vocabs) + .setLabels(labels) + .setMerges(bytePairs) + .setCandidateLabels(labels.keys.toArray) + + /* set the entailment id */ + annotatorModel.set(annotatorModel.entailmentIdParam, entailmentIds.head) + /* set the contradiction id */ + annotatorModel.set(annotatorModel.contradictionIdParam, contradictionIds.head) + /* set the engine */ + annotatorModel.set(annotatorModel.engine, detectedEngine) + + detectedEngine match { + case ModelEngine.tensorflow => + val (wrapper, signatures) = + TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) + + val _signatures = signatures match { + case Some(s) => s + case None => throw new Exception("Cannot load signature definitions from model!") + } + + /** the order of setSignatures is important if we use getSignatures inside + * setModelIfNotSet + */ + annotatorModel + .setSignatures(_signatures) + .setModelIfNotSet(spark, wrapper) + + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } +} + +/** This is the companion object of [[RoBertaForZeroShotClassification]]. Please refer to that + * class for the documentation. + */ +object RoBertaForZeroShotClassification + extends ReadablePretrainedRoBertaForZeroShotModel + with ReadRoBertaForZeroShotDLModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index e786e3010d869c..88175340eee72a 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -728,7 +728,8 @@ object PythonResourceDownloader { "ZeroShotNerModel" -> ZeroShotNerModel, "BartTransformer" -> BartTransformer, "BertForZeroShotClassification" -> BertForZeroShotClassification, - "DistilBertForZeroShotClassification" -> DistilBertForZeroShotClassification) + "DistilBertForZeroShotClassification" -> DistilBertForZeroShotClassification, + "RoBertaForZeroShotClassification" -> RoBertaForZeroShotClassification) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassificationTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassificationTestSpec.scala new file mode 100644 index 00000000000000..c13721e74006f1 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassificationTestSpec.scala @@ -0,0 +1,186 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.nlp.annotators.Tokenizer +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.training.CoNLL +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import com.johnsnowlabs.util.Benchmark +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.sql.functions.{col, explode, size} +import org.scalatest.flatspec.AnyFlatSpec + +class RoBertaForZeroShotClassificationTestSpec extends AnyFlatSpec { + + import ResourceHelper.spark.implicits._ + + val candidateLabels = + Array("urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology") + + "RoBertaForZeroShotClassification" should "correctly load custom model with extracted signatures" taggedAs SlowTest in { + + val ddd = Seq( + "I have a problem with my iphone that needs to be resolved asap!!", + "Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.", + "I have a phone and I love it!", + "I really want to visit Germany and I am planning to go there next year.", + "Let's watch some movies tonight! I am in the mood for a horror movie.", + "Have you watched the match yesterday? It was a great game!", + "We need to harry up and get to the airport. We are going to miss our flight!") + .toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + + val tokenClassifier = RoBertaForZeroShotClassification + .pretrained() + .setInputCols(Array("token", "document")) + .setOutputCol("multi_class") + .setCaseSensitive(true) + .setCoalesceSentences(true) + .setCandidateLabels(candidateLabels) + + val pipeline = new Pipeline().setStages(Array(document, tokenizer, tokenClassifier)) + + val pipelineModel = pipeline.fit(ddd) + val pipelineDF = pipelineModel.transform(ddd) + + pipelineDF.select("multi_class").show(20, false) + pipelineDF.select("document.result", "multi_class.result").show(20, false) + pipelineDF + .withColumn("doc_size", size(col("document"))) + .withColumn("label_size", size(col("multi_class"))) + .where(col("doc_size") =!= col("label_size")) + .select("doc_size", "label_size", "document.result", "multi_class.result") + .show(20, false) + + val totalDocs = pipelineDF.select(explode($"document.result")).count.toInt + val totalLabels = pipelineDF.select(explode($"multi_class.result")).count.toInt + + println(s"total tokens: $totalDocs") + println(s"total embeddings: $totalLabels") + + assert(totalDocs == totalLabels) + } + + "RoBertaForZeroShotClassification" should "be saved and loaded correctly" taggedAs SlowTest in { + + import ResourceHelper.spark.implicits._ + + val ddd = Seq( + "John Lenon was born in London and lived in Paris. 
My name is Sarah and I live in London", + "Rare Hendrix song draft sells for almost $17,000.", + "EU rejects German call to boycott British lamb .", + "TORONTO 1996-08-21").toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + + val tokenClassifier = RoBertaForZeroShotClassification + .pretrained() + .setInputCols(Array("token", "document")) + .setOutputCol("label") + .setCaseSensitive(true) + .setCoalesceSentences(true) + .setCandidateLabels(candidateLabels) + .setBatchSize(2) + + val pipeline = new Pipeline().setStages(Array(document, tokenizer, tokenClassifier)) + + val pipelineModel = pipeline.fit(ddd) + val pipelineDF = pipelineModel.transform(ddd) + + pipelineDF.select("label.result").show(false) + + Benchmark.time("Time to save RoBertaForZeroShotClassification pipeline model") { + pipelineModel.write.overwrite().save("./tmp_robertafornli_pipeline") + } + + Benchmark.time("Time to save RoBertaForZeroShotClassification model") { + pipelineModel.stages.last + .asInstanceOf[RoBertaForZeroShotClassification] + .write + .overwrite() + .save("./tmp_robertafornli_model") + } + + val loadedPipelineModel = PipelineModel.load("./tmp_robertafornli_pipeline") + loadedPipelineModel.transform(ddd).select("label.result").show(false) + + val loadedSequenceModel = + RoBertaForZeroShotClassification.load("./tmp_robertafornli_model") + println(loadedSequenceModel.getClasses.mkString("Array(", ", ", ")")) + + } + + "RoBertaForZeroShotClassification" should "benchmark test" taggedAs SlowTest in { + + val conll = CoNLL(explodeSentences = false) + val training_data = + conll + .readDataset(ResourceHelper.spark, "src/test/resources/conll2003/eng.train") + .repartition(12) + + val tokenClassifier = RoBertaForZeroShotClassification + .pretrained() + .setInputCols(Array("token", "sentence")) + .setOutputCol("class") + .setCaseSensitive(true) + .setCoalesceSentences(true) + .setCandidateLabels(candidateLabels) + .setBatchSize(2) + + val pipeline = new Pipeline() + .setStages(Array(tokenClassifier)) + + val pipelineDF = pipeline.fit(training_data).transform(training_data).cache() + Benchmark.time("Time to save pipeline results") { + pipelineDF.write.mode("overwrite").parquet("./tmp_nli_classifier") + } + + pipelineDF.select("class").show(2, false) + pipelineDF.select("sentence.result", "class.result").show(2, false) + + // only works if it's softmax - one lablel per row + pipelineDF + .withColumn("doc_size", size(col("sentence"))) + .withColumn("label_size", size(col("class"))) + .where(col("doc_size") =!= col("label_size")) + .select("doc_size", "label_size", "sentence.result", "class.result") + .show(20, false) + + val totalDocs = pipelineDF.select(explode($"sentence.result")).count.toInt + val totalLabels = pipelineDF.select(explode($"class.result")).count.toInt + + println(s"total docs: $totalDocs") + println(s"total classes: $totalLabels") + + assert(totalDocs == totalLabels) + } +} From 9b03304ccea526e865ea866f38f0353b67ba3f53 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Fri, 5 May 2023 01:26:46 +0500 Subject: [PATCH 13/32] fixing typo --- python/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index 2aa2d36e571343..d8e5c09c2707e2 100644 --- a/python/README.md +++ b/python/README.md @@ -614,7 +614,7 @@ from [Maven Central](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp). 
To add JARs to spark programs use the `--jars` option: ```sh -spark-shell --jars sparknlp.jar +spark-shell --jars spark-nlp.jar ``` The preferred way to use the library when running spark programs is using the `--packages` option as specified in From d30652d8048b9d85b252a5f898b0e0b8667ca07e Mon Sep 17 00:00:00 2001 From: Devin Ha Date: Fri, 5 May 2023 11:30:14 +0200 Subject: [PATCH 14/32] Fix loadSavedModel for DBFS --- .../scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala index ebf5087ec17df4..60d2852181af17 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/ResourceHelper.scala @@ -106,7 +106,7 @@ object ResourceHelper { case class SourceStream(resource: String) { var fileSystem: Option[FileSystem] = None - private val (pathExists, path) = OutputHelper.doesPathExists(resource) + private val (pathExists: Boolean, path: Option[Path]) = OutputHelper.doesPathExists(resource) if (!pathExists) { throw new FileNotFoundException(s"file or folder: $resource not found") } else { @@ -156,7 +156,7 @@ object ResourceHelper { Paths.get(destination.toString, path.get.getName).toUri else destination.toUri case "dbfs" => - val dbfsPath = path.toString.replace("dbfs:/", "/dbfs/") + val dbfsPath = path.get.toString.replace("dbfs:/", "/dbfs/") val sourceFile = new File(dbfsPath) val targetFile = new File(destination.toString) if (sourceFile.isFile) FileUtils.copyFileToDirectory(sourceFile, targetFile) From d0703f9d41a1aa8a1dccefef4bd812b4a77f3e0d Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Sat, 6 May 2023 11:52:41 +0000 Subject: [PATCH 15/32] Updated python API to support caching in the BartTransformer --- python/sparknlp/annotator/seq2seq/bart_transformer.py | 2 +- python/sparknlp/internal/__init__.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/sparknlp/annotator/seq2seq/bart_transformer.py b/python/sparknlp/annotator/seq2seq/bart_transformer.py index e3c2b0fc80f872..56f3719190a209 100755 --- a/python/sparknlp/annotator/seq2seq/bart_transformer.py +++ b/python/sparknlp/annotator/seq2seq/bart_transformer.py @@ -376,7 +376,7 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.BartTransf ) @staticmethod - def loadSavedModel(folder, spark_session, use_cache): + def loadSavedModel(folder, spark_session, use_cache=False): """Loads a locally saved model. 
Parameters diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index deab0142aa9a6b..beca04aa76cbda 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -221,9 +221,9 @@ def __init__(self, path, jspark): class _BartLoader(ExtendedJavaWrapper): - def __init__(self, path, jspark): + def __init__(self, path, jspark, useCache): super(_BartLoader, self).__init__( - "com.johnsnowlabs.nlp.annotators.seq2seq.BartTransformer.loadSavedModel", path, jspark) + "com.johnsnowlabs.nlp.annotators.seq2seq.BartTransformer.loadSavedModel", path, jspark, useCache) class _USELoader(ExtendedJavaWrapper): From 25737b7b5aa389c5ad8df507b4dd1c71b4255b33 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Sat, 6 May 2023 12:13:10 +0000 Subject: [PATCH 16/32] Updated python API to support caching in the BartTransformer --- python/sparknlp/annotator/seq2seq/bart_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sparknlp/annotator/seq2seq/bart_transformer.py b/python/sparknlp/annotator/seq2seq/bart_transformer.py index 56f3719190a209..c9db8f039b43fe 100755 --- a/python/sparknlp/annotator/seq2seq/bart_transformer.py +++ b/python/sparknlp/annotator/seq2seq/bart_transformer.py @@ -44,7 +44,7 @@ class BartTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): Pretrained models can be loaded with :meth:`.pretrained` of the companion object: - >>> t5 = BartTransformer.pretrained() \\ + >>> bart = BartTransformer.pretrained() \\ ... .setTask("summarize:") \\ ... .setInputCols(["document"]) \\ ... .setOutputCol("summaries") From beca5624ce3ef14ccfdcb4b8946014f920667939 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Sat, 6 May 2023 14:23:21 +0000 Subject: [PATCH 17/32] Updated API to support caching in the BartTransformer --- src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala index 37ec506d8c7cde..ac42466d5a8bbf 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala @@ -234,14 +234,6 @@ private[johnsnowlabs] class Bart( encoderInputTensors.close() if (useCache) { tensorDecoder.clearTensors() - nextStateTensor1 match { - case Some(t) => t.close() - case None => - } - nextStateTensor2 match { - case Some(t) => t.close() - case None => - } } modelOutputs } From fd3a03f18a911a31eff5504f6a5dc253cf6c808f Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Sat, 6 May 2023 15:04:39 +0000 Subject: [PATCH 18/32] Updated API to support caching in the BartTransformer --- src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala index ac42466d5a8bbf..fa50bddb24a3c7 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala @@ -234,6 +234,9 @@ private[johnsnowlabs] class Bart( encoderInputTensors.close() if (useCache) { tensorDecoder.clearTensors() + nextStateTensor1 = None + nextStateTensor2 = None + } modelOutputs } From a3c3061fd937a229b2c8347902f905bc3555b4c9 Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Sun, 7 May 2023 08:30:32 -0500 Subject: [PATCH 19/32] SPARKNLP-832 Extracting MatchStrategy as a single object and adding fix relaxedFactoryStrategy parameter --- 
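A minimal usage sketch for the new `relaxedFactoryStrategy` parameter, mirroring the Python test added in this patch. The input text and column names are illustrative only, and `MatchStrategy` is assumed to be importable from `sparknlp.common` as wired up below:

```python
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import MultiDateMatcher
from sparknlp.common import MatchStrategy
from pyspark.ml import Pipeline

spark = sparknlp.start()

# Illustrative input: two lease periods, i.e. four dates in one document.
data = spark.createDataFrame(
    [["January 1, 2021 - December 31, 2021 and January 1, 2022 - December 31, 2022"]]
).toDF("text")

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

date_matcher = MultiDateMatcher() \
    .setInputCols(["document"]) \
    .setOutputCol("date") \
    .setOutputFormat("yyyy/MM/dd") \
    .setRelaxedFactoryStrategy(MatchStrategy.MATCH_ALL)  # keep every candidate, not just the first

pipeline = Pipeline(stages=[document_assembler, date_matcher])
result = pipeline.fit(data).transform(data)
result.select("date.result").show(truncate=False)
```

With the default `MATCH_FIRST` strategy only the first relaxed match per rule is kept; `MATCH_ALL` keeps every match, which is why the test below expects four dates.
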
.../annotator/matcher/date_matcher.py | 18 +++++++ python/sparknlp/common/__init__.py | 1 + python/sparknlp/common/match_strategy.py | 33 ++++++++++++ .../matcher/multi_date_matcher_test.py | 50 +++++++++++++++++++ .../nlp/annotators/DateMatcherUtils.scala | 38 +++++++++++--- .../nlp/annotators/MultiDateMatcher.scala | 15 +++--- .../nlp/annotators/RegexMatcherModel.scala | 6 +-- .../nlp/annotators/Tokenizer.scala | 4 +- .../pragmatic/PragmaticContentFormatter.scala | 2 +- .../sbd/pragmatic/PragmaticMethod.scala | 2 +- .../nlp/util/io/MatchStrategy.scala | 37 ++++++++++++++ .../nlp/util/regex/RuleFactory.scala | 13 ++--- .../annotators/MultiDateMatcherTestSpec.scala | 28 +++++++++++ 13 files changed, 218 insertions(+), 29 deletions(-) create mode 100644 python/sparknlp/common/match_strategy.py create mode 100644 python/test/annotator/matcher/multi_date_matcher_test.py create mode 100644 src/main/scala/com/johnsnowlabs/nlp/util/io/MatchStrategy.scala diff --git a/python/sparknlp/annotator/matcher/date_matcher.py b/python/sparknlp/annotator/matcher/date_matcher.py index 55c38010448295..8bf9583eef6884 100755 --- a/python/sparknlp/annotator/matcher/date_matcher.py +++ b/python/sparknlp/annotator/matcher/date_matcher.py @@ -67,6 +67,11 @@ class DateMatcherUtils(Params): "source language for explicit translation", typeConverter=TypeConverters.toString) + relaxedFactoryStrategy = Param(Params._dummy(), + "relaxedFactoryStrategy", + "Matched Strategy to searches relaxed dates", + typeConverter=TypeConverters.toString) + def setInputFormats(self, value): """Sets input formats patterns to match in the documents. @@ -159,6 +164,19 @@ def setAnchorDateDay(self, value): """ return self._set(anchorDateDay=value) + def setRelaxedFactoryStrategy(self, matchStrategy=MatchStrategy.MATCH_FIRST): + """ Sets matched strategy to search relaxed dates by ordered rules by more exhaustive to less Strategy. + + Not all of the date information needs to be included. For example + ``"YYYY"`` is also a valid input. + + Parameters + ---------- + matchStrategy : MatchStrategy + Matched strategy to search relaxed dates by ordered rules by more exhaustive to less Strategy + """ + return self._set(relaxedFactoryStrategy=matchStrategy) + class DateMatcher(AnnotatorModel, DateMatcherUtils): """Matches standard date formats into a provided format diff --git a/python/sparknlp/common/__init__.py b/python/sparknlp/common/__init__.py index 8a83c082e82516..f5cdcb45079544 100644 --- a/python/sparknlp/common/__init__.py +++ b/python/sparknlp/common/__init__.py @@ -22,3 +22,4 @@ from sparknlp.common.storage import * from sparknlp.common.utils import * from sparknlp.common.annotator_type import * +from sparknlp.common.match_strategy import * diff --git a/python/sparknlp/common/match_strategy.py b/python/sparknlp/common/match_strategy.py new file mode 100644 index 00000000000000..1fa455aeaf2588 --- /dev/null +++ b/python/sparknlp/common/match_strategy.py @@ -0,0 +1,33 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Allowed strategies for RuleFactory applications regarding matching"""
+
+
+class MatchStrategy(object):
+    """Object that contains constants for the matching strategies used in RuleFactory.
+
+    Possible values are:
+
+    ================================== ===============================================================================
+    Value                              Description
+    ================================== ===============================================================================
+    ``MatchStrategy.MATCH_ALL``        This strategy matches all occurrences of all rules in the given text.
+    ``MatchStrategy.MATCH_FIRST``      This strategy matches only the first occurrence of each rule in the given text.
+    ``MatchStrategy.MATCH_COMPLETE``   This strategy matches only when a rule matches the given text completely.
+    ================================== ===============================================================================
+    """
+    MATCH_ALL = "MATCH_ALL"
+    MATCH_FIRST = "MATCH_FIRST"
+    MATCH_COMPLETE = "MATCH_COMPLETE"
diff --git a/python/test/annotator/matcher/multi_date_matcher_test.py b/python/test/annotator/matcher/multi_date_matcher_test.py
new file mode 100644
index 00000000000000..a7fe4636fa30dc
--- /dev/null
+++ b/python/test/annotator/matcher/multi_date_matcher_test.py
@@ -0,0 +1,50 @@
+# Copyright 2017-2022 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from pyspark.sql.functions import size
+from test.util import SparkContextForTest
+
+
+@pytest.mark.fast
+class MultiDateMatcherTestSpec(unittest.TestCase):
+
+    def setUp(self):
+        text = """
+         Lease Period Monthly Installment of Base Rent.
+          January 1, 2021 –December 31, 2021 $20,304.85 .
+          January 1, 2022 –December 31, 2022 $20,914.00 . 
+ """ + self.data = SparkContextForTest.spark.createDataFrame([[text]]).toDF("text") + + def runTest(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + date_matcher = MultiDateMatcher() \ + .setInputCols(["document"]) \ + .setOutputCol("date") \ + .setOutputFormat("yyyy/MM/dd") \ + .setRelaxedFactoryStrategy(MatchStrategy.MATCH_ALL) + + pipeline = Pipeline(stages=[document_assembler, date_matcher]) + model = pipeline.fit(self.data) + result = model.transform(self.data) + + actual_dates = result.select(size("date.result")).collect()[0][0] + self.assertEquals(actual_dates, 4) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala index 392bc45d2475fe..d44a921dab4076 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/DateMatcherUtils.scala @@ -16,7 +16,8 @@ package com.johnsnowlabs.nlp.annotators -import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory} +import com.johnsnowlabs.nlp.util.io.MatchStrategy +import com.johnsnowlabs.nlp.util.regex.RuleFactory import org.apache.spark.ml.param._ import java.util.Calendar @@ -249,6 +250,31 @@ trait DateMatcherUtils extends Params { */ def setSourceLanguage(value: String): this.type = set(sourceLanguage, value) + /** Matched strategy to search relaxed dates by ordered rules by more exhaustive to less + * Strategy + * + * @group param + */ + val relaxedFactoryStrategy: Param[String] = + new Param(this, "relaxedFactoryStrategy", "Matched Strategy to searches relaxed dates") + + /** To set matched strategy to search relaxed dates by ordered rules by more exhaustive to less + * Strategy + * + * @group param + */ + def setRelaxedFactoryStrategy( + matchStrategy: MatchStrategy.Format = MatchStrategy.MATCH_FIRST): this.type = { + set(relaxedFactoryStrategy, matchStrategy.toString) + } + + /** To get matched strategy to search relaxed dates by ordered rules by more exhaustive to less + * Strategy + * + * @group param + */ + def getRelaxedFactoryStrategy: String = $(relaxedFactoryStrategy) + setDefault( inputFormats -> Array(""), outputFormat -> "yyyy/MM/dd", @@ -257,7 +283,8 @@ trait DateMatcherUtils extends Params { anchorDateDay -> -1, readMonthFirst -> true, defaultDayWhenMissing -> 1, - sourceLanguage -> "en") + sourceLanguage -> "en", + relaxedFactoryStrategy -> MatchStrategy.MATCH_FIRST.toString) protected val formalFactoryInputFormats = new RuleFactory(MatchStrategy.MATCH_ALL) @@ -322,11 +349,10 @@ trait DateMatcherUtils extends Params { .addRule(formalDateAlt2, "formal date with year at beginning") .addRule(formalDateShort, "formal date short version") - /** Searches relaxed dates by ordered rules by more exhaustive to less Strategy used is to match - * first only. any other matches discarded Auto completes short versions of months. Any two - * digit year is considered to be XX century + /** Searches relaxed dates by ordered rules by more exhaustive to less Strategy. Auto completes + * short versions of months. 
Any two digit year is considered to be XX century */ - protected val relaxedFactory: RuleFactory = new RuleFactory(MatchStrategy.MATCH_ALL) + protected lazy val relaxedFactory: RuleFactory = new RuleFactory(getRelaxedFactoryStrategy) .addRule(relaxedDayNumbered, "relaxed days") .addRule(relaxedMonths.r, "relaxed months exclusive") .addRule(relaxedYear, "relaxed year") diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala index 3925501330700d..54dfe18708a018 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcher.scala @@ -247,14 +247,15 @@ class MultiDateMatcher(override val uid: String) } private def extractRelaxedDate(text: String): Seq[MatchedDateTime] = { - val possibleDates = relaxedFactory.findMatch(text) - val possibleDatesByIndexMatch = possibleDates.groupBy(_.indexMatch) - possibleDatesByIndexMatch.flatMap{ case (_, possibleDates) => - computePossibleDates(possibleDates) - }.toSeq + val possibleDates = relaxedFactory.findMatch(text) + val possibleDatesByIndexMatch = possibleDates.groupBy(_.indexMatch) + possibleDatesByIndexMatch.flatMap { case (_, possibleDates) => + computePossibleDates(possibleDates) + }.toSeq } - private def computePossibleDates(possibleDates: Seq[RuleFactory.RuleMatch]): Seq[MatchedDateTime] = { + private def computePossibleDates( + possibleDates: Seq[RuleFactory.RuleMatch]): Seq[MatchedDateTime] = { var dayMatch = $(defaultDayWhenMissing) var monthMatch = defaultMonthWhenMissing var yearMatch = defaultYearWhenMissing @@ -263,7 +264,7 @@ class MultiDateMatcher(override val uid: String) possibleDates.foreach(possibleDate => { if (possibleDate.identifier == "relaxed days" && possibleDate.content.matched.exists( - _.isDigit)) { + _.isDigit)) { changes += 1 dayMatch = possibleDate.content.matched.filter(_.isDigit).toInt } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexMatcherModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexMatcherModel.scala index 9b34ca3f8f21b7..921aa9b6b85c65 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexMatcherModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/RegexMatcherModel.scala @@ -19,8 +19,8 @@ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.ArrayFeature -import com.johnsnowlabs.nlp.util.regex.MatchStrategy.MatchStrategy -import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RegexRule, RuleFactory, TransformStrategy} +import com.johnsnowlabs.nlp.util.io.MatchStrategy +import com.johnsnowlabs.nlp.util.regex.{RegexRule, RuleFactory, TransformStrategy} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.Identifiable @@ -104,7 +104,7 @@ class RegexMatcherModel(override val uid: String) def getExternalRules: Array[(String, String)] = $$(externalRules) /** MATCH_ALL|MATCH_FIRST|MATCH_COMPLETE */ - private def getFactoryStrategy: MatchStrategy = $(strategy) match { + private def getFactoryStrategy: MatchStrategy.Format = $(strategy) match { case "MATCH_ALL" => MatchStrategy.MATCH_ALL case "MATCH_FIRST" => MatchStrategy.MATCH_FIRST case "MATCH_COMPLETE" => MatchStrategy.MATCH_COMPLETE diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala index 
694811d52a99d2..cf4c6d132769c4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/Tokenizer.scala @@ -19,8 +19,8 @@ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.AnnotatorApproach import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, TOKEN} import com.johnsnowlabs.nlp.annotators.param.ExternalResourceParam -import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper} -import com.johnsnowlabs.nlp.util.regex.{MatchStrategy, RuleFactory} +import com.johnsnowlabs.nlp.util.io.{ExternalResource, MatchStrategy, ReadAs, ResourceHelper} +import com.johnsnowlabs.nlp.util.regex.RuleFactory import org.apache.spark.ml.PipelineModel import org.apache.spark.ml.param.{BooleanParam, IntParam, Param, StringArrayParam} import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala index 9b8240fd56edc4..447e0673502ea9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticContentFormatter.scala @@ -18,7 +18,7 @@ package com.johnsnowlabs.nlp.annotators.sbd.pragmatic import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.PragmaticDictionaries._ import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.PragmaticSymbols._ -import com.johnsnowlabs.nlp.util.regex.MatchStrategy._ +import com.johnsnowlabs.nlp.util.io.MatchStrategy.MATCH_ALL import com.johnsnowlabs.nlp.util.regex.TransformStrategy._ import com.johnsnowlabs.nlp.util.regex.{RegexRule, RuleFactory} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticMethod.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticMethod.scala index c2900ac8766df8..caec7375b45da3 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticMethod.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/sbd/pragmatic/PragmaticMethod.scala @@ -17,7 +17,7 @@ package com.johnsnowlabs.nlp.annotators.sbd.pragmatic import com.johnsnowlabs.nlp.annotators.common.Sentence -import com.johnsnowlabs.nlp.util.regex.MatchStrategy.MATCH_ALL +import com.johnsnowlabs.nlp.util.io.MatchStrategy.MATCH_ALL import com.johnsnowlabs.nlp.util.regex.RuleFactory import com.johnsnowlabs.nlp.util.regex.TransformStrategy.{ REPLACE_ALL_WITH_SYMBOL, diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/io/MatchStrategy.scala b/src/main/scala/com/johnsnowlabs/nlp/util/io/MatchStrategy.scala new file mode 100644 index 00000000000000..1aaa3d5006006f --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/util/io/MatchStrategy.scala @@ -0,0 +1,37 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.johnsnowlabs.nlp.util.io + +import com.johnsnowlabs.nlp.util.regex.RuleFactory + +/** Allowed strategies for [[RuleFactory]] applications regarding replacement */ +object MatchStrategy extends Enumeration { + + implicit def str2frmt(v: String): Format = { + v.toUpperCase match { + case "MATCH_ALL" => MATCH_ALL + case "MATCH_FIRST" => MATCH_FIRST + case "MATCH_COMPLETE" => MATCH_COMPLETE + case _ => + throw new MatchError( + s"Invalid MatchStrategy. Must be either of ${this.values.mkString("|")}") + } + } + + type Format = Value + val MATCH_ALL, MATCH_FIRST, MATCH_COMPLETE = Value +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala index 33b0c5cd6a34b2..e7da0c8bd347e6 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/util/regex/RuleFactory.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.util.regex import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.RuleSymbols +import com.johnsnowlabs.nlp.util.io.MatchStrategy import scala.util.matching.Regex @@ -27,7 +28,7 @@ import scala.util.matching.Regex * How to decide when replacing or transforming content with Regex */ class RuleFactory( - matchStrategy: MatchStrategy.MatchStrategy, + matchStrategy: MatchStrategy.Format, transformStrategy: TransformStrategy.TransformStrategy = TransformStrategy.NO_TRANSFORM) extends RuleSymbols with Serializable { @@ -69,7 +70,7 @@ class RuleFactory( matchStrategy match { case MATCH_ALL => rules.flatMap(rule => - rule.regex.findAllMatchIn(text).zipWithIndex.map{ case (currentMatch, index) => + rule.regex.findAllMatchIn(text).zipWithIndex.map { case (currentMatch, index) => RuleMatch(currentMatch, rule.identifier, index) }) case MATCH_FIRST => @@ -226,7 +227,7 @@ object RuleFactory { /** Specific partial constructor for [[RuleFactory]] where MatchStrategy might change on runtime */ def lateMatching(transformStrategy: TransformStrategy.TransformStrategy)( - matchStrategy: MatchStrategy.MatchStrategy): RuleFactory = + matchStrategy: MatchStrategy.Format): RuleFactory = new RuleFactory(matchStrategy, transformStrategy) /** Internal representation of a regex match @@ -246,9 +247,3 @@ object TransformStrategy extends Enumeration { REPLACE_WITH_SYMBOL_AND_BREAK, PROTECT_FROM_BREAK, BREAK_AND_PROTECT_FROM_BREAK, REPLACE_EACH_WITH_SYMBOL, REPLACE_EACH_WITH_SYMBOL_AND_BREAK = Value } - -/** Allowed strategies for [[RuleFactory]] applications regarding matching */ -object MatchStrategy extends Enumeration { - type MatchStrategy = Value - val MATCH_ALL, MATCH_FIRST, MATCH_COMPLETE = Value -} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherTestSpec.scala index a470090af5da0c..2768c0ebab76a6 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/MultiDateMatcherTestSpec.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.AnnotatorType.DATE +import com.johnsnowlabs.nlp.util.io.MatchStrategy import com.johnsnowlabs.nlp.{Annotation, AnnotatorType, DataBuilder} import com.johnsnowlabs.tags.FastTest import org.apache.spark.sql.{Dataset, Row} @@ -307,4 +308,31 @@ class MultiDateMatcherTestSpec extends AnyFlatSpec with DateMatcherBehaviors { assert(results == expectedDates) } + + "a MultiDateMatcher" should 
"correctly find all possible dates in a text" taggedAs FastTest in { + + val data: Dataset[Row] = DataBuilder.multipleDataBuild(Array(""" + Lease Period Monthly Installment of Base Rent + January 1, 2021 –December 31, 2021 $20,304.85* + January 1, 2022 –December 31, 2022 $20,914.00 + """)) + + val dateMatcher = new MultiDateMatcher() + .setInputCols(Array("document")) + .setOutputCol("date") + .setOutputFormat("yyyy/MM/dd") + .setRelaxedFactoryStrategy(MatchStrategy.MATCH_ALL) + .transform(data) + + val results = Annotation.collect(dateMatcher, "date").flatten.toSeq.sortBy(_.end) + + val expectedDates = Seq( + Annotation(DATE, 67, 81, "2021/01/01", Map("sentence" -> "0")), + Annotation(DATE, 84, 100, "2021/12/31", Map("sentence" -> "0")), + Annotation(DATE, 103, 138, "2022/01/20", Map("sentence" -> "0")), + Annotation(DATE, 132, 157, "2022/12/01", Map("sentence" -> "0"))) + + assert(results == expectedDates) + } + } From 135ecea4573366d7728ec05817252510dd081fad Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Tue, 9 May 2023 11:20:23 +0000 Subject: [PATCH 20/32] Added a generate method to support text generation. --- .../scala/com/johnsnowlabs/ml/ai/Bart.scala | 269 ++++++++---------- .../ml/ai/util/Generation/Generate.scala | 216 +++++++++++--- 2 files changed, 293 insertions(+), 192 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala index fa50bddb24a3c7..0d0aa72696618e 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala @@ -17,35 +17,20 @@ package com.johnsnowlabs.ml.ai import com.johnsnowlabs.ml.ai.util.Generation.Generate -import com.johnsnowlabs.ml.ai.util.Generation.Logit.LogitProcess.{ - MinLengthLogitProcessor, - NoRepeatNgramsLogitProcessor, - RepetitionPenaltyLogitProcessor -} -import com.johnsnowlabs.ml.ai.util.Generation.Logit.LogitProcessorList -import com.johnsnowlabs.ml.ai.util.Generation.Logit.LogitWarper.{ - TemperatureLogitWarper, - TopKLogitWarper, - TopPLogitWarper -} -import com.johnsnowlabs.ml.ai.util.Generation.Search.BeamSearchScorer -import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} +import com.johnsnowlabs.ml.tensorflow.sign.ModelSignatureManager import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} -import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BpeTokenizer, BartTokenizer} import com.johnsnowlabs.nlp.annotators.common.SentenceSplit +import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BartTokenizer, BpeTokenizer} import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} import org.tensorflow.{Session, Tensor} -import scala.collection.JavaConverters._ -import scala.collection.mutable +import scala.collection.JavaConverters.* /** This class is used to run Bart model for For Sequence Batches of WordpieceTokenizedSentence. 
* Input for this model must be tokenized with a SentencePieceModel, * * @param tensorflow * BART Model wrapper with TensorFlowWrapper - * @param bpeTokenizer - * BART Byte-Pair Encoder model with BPEWrapper * @param configProtoBytes * Configuration for TensorFlow session */ @@ -60,16 +45,14 @@ private[johnsnowlabs] class Bart( extends Serializable with Generate { - private val _tfBartSignatures: Map[String, String] = - signatures.getOrElse(ModelSignatureManager.apply()) - val bpeTokenizer: BartTokenizer = BpeTokenizer .forModel("bart", merges = merges, vocab = vocabulary, padWithSentenceTokens = false) .asInstanceOf[BartTokenizer] - + private val _tfBartSignatures: Map[String, String] = + signatures.getOrElse(ModelSignatureManager.apply()) private val paddingTokenId = 1 private val eosTokenId = 2 - private val vocab_size = 50264 + private val vocabSize = 50264 private val encoderInputIdsKey = "encoder_encoder_input_ids:0" private val encoderAttentionMaskKey = "encoder_encoder_attention_mask:0" private val encoderOutputKey = "StatefulPartitionedCall_2:0" @@ -91,29 +74,90 @@ private[johnsnowlabs] class Bart( private val decoderCachedOutputLogitsKey = "StatefulPartitionedCall:2" private val decoderCachedOutputCache1Key = "StatefulPartitionedCall:0" private val decoderCachedOutputCache2Key = "StatefulPartitionedCall:1" + var tensorDecoder = new TensorResources() private var nextStateTensor1: Option[org.tensorflow.Tensor] = None private var nextStateTensor2: Option[org.tensorflow.Tensor] = None - var tensorDecoder = new TensorResources() - private def sessionWarmup(): Unit = { - val dummyInput = Array.fill(1)(0) ++ Array(eosTokenId) - tag( - Seq(dummyInput), - minOutputLength = 0, - maxOutputLength = 1, - doSample = false, - temperature = 0f, - topK = 0, - topP = 0f, - repetitionPenalty = 0f, - noRepeatNgramSize = 0, - randomSeed = Option(0), - ignoreTokenIds = Array(0), - beamSize = 1) - } + /** @param sentences + * @param batchSize + * @param minOutputLength + * @param maxOutputLength + * @param doSample + * @param temperature + * @param topK + * @param topP + * @param repetitionPenalty + * @param noRepeatNgramSize + * @param task + * @param randomSeed + * @param ignoreTokenIds + * @param beamSize + * @return + */ + def predict( + sentences: Seq[Annotation], + batchSize: Int, + minOutputLength: Int, + maxOutputLength: Int, + doSample: Boolean, + temperature: Double, + topK: Int, + topP: Double, + repetitionPenalty: Double, + noRepeatNgramSize: Int, + task: String, + randomSeed: Option[Long] = None, + ignoreTokenIds: Array[Int] = Array(), + beamSize: Int): Seq[Annotation] = { + + val batchDecoder = sentences.grouped(batchSize).toArray.flatMap { batch => + val batchSP = encode(batch, task) + val spIds = tag( + batchSP, + minOutputLength, + maxOutputLength, + doSample, + temperature, + topK, + topP, + repetitionPenalty, + noRepeatNgramSize, + randomSeed, + ignoreTokenIds, + beamSize) -// sessionWarmup() + decode(spIds) + + } + var sentBegin, nextSentEnd = 0 + batchDecoder.zip(sentences).map { case (content, sent) => + nextSentEnd += content.length - 1 + val annots = new Annotation( + annotatorType = AnnotatorType.DOCUMENT, + begin = sentBegin, + end = nextSentEnd, + result = content, + metadata = sent.metadata) + sentBegin += nextSentEnd + 1 + annots + } + } + + /** @param batch + * @param minOutputLength + * @param maxOutputLength + * @param doSample + * @param temperature + * @param topK + * @param topP + * @param repetitionPenalty + * @param noRepeatNgramSize + * @param randomSeed + * @param 
ignoreTokenIds + * @param beamSize + * @return + */ def tag( batch: Seq[Array[Int]], minOutputLength: Int, @@ -127,9 +171,9 @@ private[johnsnowlabs] class Bart( randomSeed: Option[Long], ignoreTokenIds: Array[Int] = Array(), beamSize: Int): Array[Array[Int]] = { + val ignoreTokenIdsInt = ignoreTokenIds val expandedEncoderInputIdsVals = batch.flatMap(x => List.fill(beamSize)(x)) -// val expandedEncoderInputIdsVals = batch val sequencesLength = expandedEncoderInputIdsVals.map(x => x.length).toArray val maxSentenceLength = sequencesLength.max // - curLen @@ -176,7 +220,7 @@ private[johnsnowlabs] class Bart( val encoderAttentionMaskTensors = tensorEncoder.createIntBufferTensor(shape, encoderAttentionMaskBuffers) - val runner = session.runner; + val runner = session.runner runner .feed(encoderInputIdsKey, encoderInputTensors) @@ -207,11 +251,12 @@ private[johnsnowlabs] class Bart( val decoderEncoderStateTensors = tensorEncoder.createFloatBufferTensor( Array(expandedEncoderInputIdsVals.length, maxSentenceLength, dim), decoderEncoderStateBuffers) - - val modelOutputs = generateBeamSearch( + val decoderInputs = batch.map(_ => Array(this.eosTokenId)).toArray + val modelOutputs = generate( batch, decoderEncoderStateTensors, encoderAttentionMaskTensors, + decoderInputs, maxOutputLength, minOutputLength, doSample, @@ -222,6 +267,9 @@ private[johnsnowlabs] class Bart( topP, repetitionPenalty, noRepeatNgramSize, + this.vocabSize, + this.eosTokenId, + this.paddingTokenId, randomSeed, ignoreTokenIdsInt, session) @@ -236,72 +284,10 @@ private[johnsnowlabs] class Bart( tensorDecoder.clearTensors() nextStateTensor1 = None nextStateTensor2 = None - } modelOutputs } - def generateBeamSearch( - inputIds: Seq[Array[Int]], - decoderEncoderStateTensors: Tensor, - encoderAttentionMaskTensors: Tensor, - maxOutputLength: Int, - minOutputLength: Int, - doSample: Boolean, - beamSize: Int, - numReturnSequences: Int, - temperature: Double, - topK: Int, - topP: Double, - repetitionPenalty: Double, - noRepeatNgramSize: Int, - randomSeed: Option[Long], - ignoreTokenIds: Array[Int] = Array(), - session: Session): Array[Array[Int]] = { - - var decoderInputs = inputIds.map(_ => Array(this.eosTokenId)).toArray - - var logitProcessorList = new LogitProcessorList() - - logitProcessorList.addProcess(new RepetitionPenaltyLogitProcessor(repetitionPenalty)) - - logitProcessorList.addProcess( - new NoRepeatNgramsLogitProcessor( - noRepeatNgramSize = noRepeatNgramSize, - vocabSize = this.vocab_size)) - - logitProcessorList.addProcess( - new MinLengthLogitProcessor(this.eosTokenId, minOutputLength, this.vocab_size)) - - logitProcessorList.addProcess(new TemperatureLogitWarper(temperature)) - - logitProcessorList.addProcess(new TopKLogitWarper(topK)) - - logitProcessorList.addProcess(new TopPLogitWarper(topP)) - - val beamSearchScorer = new BeamSearchScorer( - beamSize = beamSize, - batchSize = inputIds.length, - lengthPenalty = repetitionPenalty.toFloat, - doEarlyStopping = false, - numBeamHypothesisToKeep = numReturnSequences, - maxLength = maxOutputLength) - - this.beamSearch( - inputIds, - decoderInputs, - decoderEncoderStateTensors, - encoderAttentionMaskTensors, - beamSearchScorer, - logitProcessorList, - maxOutputLength, - this.paddingTokenId, - this.eosTokenId, - doSample, - randomSeed, - session) - } - def decode(sentences: Array[Array[Int]]): Seq[String] = { sentences.map(s => bpeTokenizer.decodeTokens(s.map(_.toInt))) } @@ -320,56 +306,6 @@ private[johnsnowlabs] class Bart( }) } - def predict( - sentences: Seq[Annotation], - 
batchSize: Int, - minOutputLength: Int, - maxOutputLength: Int, - doSample: Boolean, - temperature: Double, - topK: Int, - topP: Double, - repetitionPenalty: Double, - noRepeatNgramSize: Int, - task: String, - randomSeed: Option[Long] = None, - ignoreTokenIds: Array[Int] = Array(), - beamSize: Int): Seq[Annotation] = { - - val batchDecoder = sentences.grouped(batchSize).toArray.flatMap { batch => - val batchSP = encode(batch, task) - val spIds = tag( - batchSP, - minOutputLength, - maxOutputLength, - doSample, - temperature, - topK, - topP, - repetitionPenalty, - noRepeatNgramSize, - randomSeed, - ignoreTokenIds, - beamSize) - - decode(spIds) - - } - - var sentBegin, nextSentEnd = 0 - batchDecoder.zip(sentences).map { case (content, sent) => - nextSentEnd += content.length - 1 - val annots = new Annotation( - annotatorType = AnnotatorType.DOCUMENT, - begin = sentBegin, - end = nextSentEnd, - result = content, - metadata = sent.metadata) - sentBegin += nextSentEnd + 1 - annots - } - } - override def getModelOutput( encoderInputIds: Seq[Array[Int]], decoderInputIds: Seq[Array[Int]], @@ -381,7 +317,7 @@ private[johnsnowlabs] class Bart( val sequencesLength = encoderInputIds.map(x => x.length).toArray var maxSentenceLength = sequencesLength.max // - curLen maxSentenceLength = Math.max(maxSentenceLength, maxLength) - val vocabSize = this.vocab_size + val vocabSize = this.vocabSize val decoderInputLength = decoderInputIds.head.length val batchSize = encoderInputIds.length @@ -458,4 +394,21 @@ private[johnsnowlabs] class Bart( decoderInputTensors.close() nextTokenLogits } + + private def sessionWarmup(): Unit = { + val dummyInput = Array.fill(1)(0) ++ Array(eosTokenId) + tag( + Seq(dummyInput), + minOutputLength = 0, + maxOutputLength = 1, + doSample = false, + temperature = 0f, + topK = 0, + topP = 0f, + repetitionPenalty = 0f, + noRepeatNgramSize = 0, + randomSeed = Option(0), + ignoreTokenIds = Array(0), + beamSize = 1) + } } diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala b/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala index 892cbe708c1e02..40c49d94dc3f51 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala @@ -16,13 +16,161 @@ package com.johnsnowlabs.ml.ai.util.Generation import com.johnsnowlabs.ml.ai.util.Generation.Search.BeamScorer -import com.johnsnowlabs.ml.ai.util.Generation.Logit.LogitProcessorList import scala.math._ import scala.util.control.Breaks._ -import scala.util.Random +import com.johnsnowlabs.ml.ai.util.Generation.Logit.LogitProcess.{ + MinLengthLogitProcessor, + NoRepeatNgramsLogitProcessor, + RepetitionPenaltyLogitProcessor +} +import com.johnsnowlabs.ml.ai.util.Generation.Logit.LogitProcessorList +import com.johnsnowlabs.ml.ai.util.Generation.Logit.LogitWarper.{ + TemperatureLogitWarper, + TopKLogitWarper, + TopPLogitWarper +} + +import com.johnsnowlabs.ml.ai.util.Generation.Search.BeamSearchScorer import org.tensorflow.{Session, Tensor} trait Generate { + + /** Text Generation using Beam Search + * @param inputIds + * input ids + * @param decoderEncoderStateTensors + * decoder encoder state tensors + * @param encoderAttentionMaskTensors + * encoder attention mask tensors + * @param decoderInputs + * decoder inputs + * @param maxOutputLength + * max output length + * @param minOutputLength + * min output length + * @param doSample + * do sample + * @param beamSize + * beam size + * @param numReturnSequences + * 
num return sequences + * @param temperature + * temperature + * @param topK + * top K + * @param topP + * top P + * @param repetitionPenalty + * repetition penalty + * @param noRepeatNgramSize + * no repeat ngram size + * @param vocabSize + * vocab size + * @param eosTokenId + * eos token id + * @param paddingTokenId + * padding token id + * @param randomSeed + * random seed + * @param ignoreTokenIds + * ignore token ids + * @param session + * session + * @return + * Array of generated sequences + */ + def generate( + inputIds: Seq[Array[Int]], + decoderEncoderStateTensors: Tensor, + encoderAttentionMaskTensors: Tensor, + decoderInputs: Array[Array[Int]], + maxOutputLength: Int, + minOutputLength: Int, + doSample: Boolean, + beamSize: Int, + numReturnSequences: Int, + temperature: Double, + topK: Int, + topP: Double, + repetitionPenalty: Double, + noRepeatNgramSize: Int, + vocabSize: Int, + eosTokenId: Int, + paddingTokenId: Int, + randomSeed: Option[Long], + ignoreTokenIds: Array[Int] = Array(), + session: Session): Array[Array[Int]] = { + + // TODO: Add support for ignoreTokenIds + + val logitProcessorList = new LogitProcessorList() + + logitProcessorList.addProcess(new RepetitionPenaltyLogitProcessor(repetitionPenalty)) + + logitProcessorList.addProcess( + new NoRepeatNgramsLogitProcessor( + noRepeatNgramSize = noRepeatNgramSize, + vocabSize = vocabSize)) + + logitProcessorList.addProcess( + new MinLengthLogitProcessor(eosTokenId, minOutputLength, vocabSize)) + + logitProcessorList.addProcess(new TemperatureLogitWarper(temperature)) + + logitProcessorList.addProcess(new TopKLogitWarper(topK)) + + logitProcessorList.addProcess(new TopPLogitWarper(topP)) + + val beamSearchScorer = new BeamSearchScorer( + beamSize = beamSize, + batchSize = inputIds.length, + lengthPenalty = repetitionPenalty.toFloat, + doEarlyStopping = false, + numBeamHypothesisToKeep = numReturnSequences, + maxLength = maxOutputLength) + + this.beamSearch( + inputIds, + decoderInputs, + decoderEncoderStateTensors, + encoderAttentionMaskTensors, + beamSearchScorer, + logitProcessorList, + maxOutputLength, + paddingTokenId, + eosTokenId, + doSample, + randomSeed, + session) + } + + /** Beam Search for text generation + * @param encoderInputIdsVals + * encoder input ids vals + * @param inputIdsVal + * input ids val + * @param decoderEncoderStateTensors + * decoder encoder state tensors + * @param encoderAttentionMaskTensors + * encoder attention mask tensors + * @param beamScorer + * beam scorer + * @param logitProcessor + * logit processor + * @param maxLength + * max length + * @param padTokenId + * pad token id + * @param eosTokenId + * eos token id + * @param doSample + * do sample + * @param randomSeed + * random seed + * @param session + * session + * @return + */ def beamSearch( encoderInputIdsVals: Seq[Array[Int]], inputIdsVal: Seq[Array[Int]], @@ -99,7 +247,7 @@ trait Generate { nextKTopTokenScores = Array.ofDim[Float](nextKIndices.length, nextKIndices.head.length) for (i <- nextKIndices.indices) { for (j <- nextKIndices(i).indices) { - nextKTopTokenScores(i)(j) = nextTokenScores(i)(nextKIndices(i)(j).toInt) + nextKTopTokenScores(i)(j) = nextTokenScores(i)(nextKIndices(i)(j)) } } nextKTopTokenScores = @@ -137,13 +285,13 @@ trait Generate { var newInputIds = Seq[Array[Int]]() for ((i, ind) <- beamIdx.zipWithIndex) { - val tempInput = expandedInputs(i.toInt) :+ beamNextTokens(ind) - newInputIds = newInputIds :+ (tempInput) + val tempInput = expandedInputs(i) :+ beamNextTokens(ind) + newInputIds = newInputIds :+ 
tempInput } expandedInputs = newInputIds beamScores = newBeamScores beamIndices = beamIndices.indices.map { elem => - beamIndices(beamIdx(elem).toInt) :+ beamIdx(elem) + beamIndices(beamIdx(elem)) :+ beamIdx(elem) } currentLength = currentLength + 1 if (beamScorer.isDone || (expandedInputs.head.length >= maxLength)) { @@ -165,14 +313,6 @@ trait Generate { sequenceOutputs._1 } - def getModelOutput( - encoderInputIds: Seq[Array[Int]], - decoderInputIds: Seq[Array[Int]], - decoderEncoderStateTensors: Tensor, - encoderAttentionMaskTensors: Tensor, - maxLength: Int, - session: Session): Array[Array[Float]] - def logSoftmax(values: Array[Float]): Array[Float] = { val c = values.max val expElem = values.map(x => exp(x - c)) @@ -202,26 +342,6 @@ trait Generate { outputArray // Return the reshaped array } - def sample(logits: Seq[Float], k: Int, seed: Long = 42): Array[Int] = { - val maxLogit = logits.max - val logitsExp = logits.map(logit => math.exp(logit - maxLogit)) - val sumExp = logitsExp.sum - val probs = logitsExp.map(exp => exp / sumExp) - val SeededRandom = new scala.util.Random(seed) - val randSeq = Seq.fill(k)(SeededRandom.nextDouble()) - var cumProb = 0.0 - var index = 0 - var results = Seq[Int]() - for (rand <- randSeq) { - while (index < probs.length - 1 && cumProb + probs(index) < rand) { - cumProb += probs(index) - index += 1 - } - results :+= index - } - results.toArray - } - def multinomialSampling(logitValues: Array[Float], k: Int, seed: Option[Long]): Array[Int] = { val (distFiltered, indices) = logitValues.zipWithIndex.filter { case (elem, index) => !elem.isInfinite }.sorted.unzip @@ -259,6 +379,34 @@ trait Generate { selectedIndices } + def getModelOutput( + encoderInputIds: Seq[Array[Int]], + decoderInputIds: Seq[Array[Int]], + decoderEncoderStateTensors: Tensor, + encoderAttentionMaskTensors: Tensor, + maxLength: Int, + session: Session): Array[Array[Float]] + + def sample(logits: Seq[Float], k: Int, seed: Long = 42): Array[Int] = { + val maxLogit = logits.max + val logitsExp = logits.map(logit => math.exp(logit - maxLogit)) + val sumExp = logitsExp.sum + val probs = logitsExp.map(exp => exp / sumExp) + val SeededRandom = new scala.util.Random(seed) + val randSeq = Seq.fill(k)(SeededRandom.nextDouble()) + var cumProb = 0.0 + var index = 0 + var results = Seq[Int]() + for (rand <- randSeq) { + while (index < probs.length - 1 && cumProb + probs(index) < rand) { + cumProb += probs(index) + index += 1 + } + results :+= index + } + results.toArray + } + def softmax(logitValues: Array[Float]): Array[Float] = { val maxLogit = logitValues.max val logitsExp = logitValues.map(l => Math.exp(l - maxLogit)) From 3947ba02b919ccd231c2efbba91f86044d2a1cd9 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Tue, 9 May 2023 11:31:41 +0000 Subject: [PATCH 21/32] Added a comments and method parameters --- .../scala/com/johnsnowlabs/ml/ai/Bart.scala | 76 ++++++++++++------- .../ml/ai/util/Generation/Generate.scala | 34 ++++++++- 2 files changed, 82 insertions(+), 28 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala index 0d0aa72696618e..9e230e941a9783 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala @@ -78,20 +78,20 @@ private[johnsnowlabs] class Bart( private var nextStateTensor1: Option[org.tensorflow.Tensor] = None private var nextStateTensor2: Option[org.tensorflow.Tensor] = None - /** @param sentences - * @param batchSize - * @param 
minOutputLength - * @param maxOutputLength - * @param doSample - * @param temperature - * @param topK - * @param topP - * @param repetitionPenalty - * @param noRepeatNgramSize - * @param task - * @param randomSeed - * @param ignoreTokenIds - * @param beamSize + /** @param sentences Sequence of WordpieceTokenizedSentence + * @param batchSize Batch size + * @param minOutputLength Minimum length of output + * @param maxOutputLength Maximum length of output + * @param doSample Whether to sample or not + * @param temperature Temperature for sampling + * @param topK Top K for sampling + * @param topP Top P for sampling + * @param repetitionPenalty Repetition penalty for sampling + * @param noRepeatNgramSize No repeat ngram size for sampling + * @param task Task + * @param randomSeed Random seed + * @param ignoreTokenIds Ignore token ids + * @param beamSize Beam size * @return */ def predict( @@ -144,19 +144,19 @@ private[johnsnowlabs] class Bart( } } - /** @param batch - * @param minOutputLength - * @param maxOutputLength - * @param doSample - * @param temperature - * @param topK - * @param topP - * @param repetitionPenalty - * @param noRepeatNgramSize - * @param randomSeed - * @param ignoreTokenIds - * @param beamSize - * @return + /** @param batch Sequence of WordpieceTokenizedSentence + * @param minOutputLength Minimum length of output + * @param maxOutputLength Maximum length of output + * @param doSample Whether to sample or not + * @param temperature Temperature for sampling + * @param topK Top K for sampling + * @param topP Top P for sampling + * @param repetitionPenalty Repetition penalty for sampling + * @param noRepeatNgramSize No repeat ngram size for sampling + * @param randomSeed Random seed + * @param ignoreTokenIds Ignore token ids + * @param beamSize Beam size + * @return Sequence of WordpieceTokenizedSentence */ def tag( batch: Seq[Array[Int]], @@ -288,10 +288,21 @@ private[johnsnowlabs] class Bart( modelOutputs } + /** + * Decode a sequence of sentences + * @param sentences Sequence of sentences + * @return Sequence of decoded sentences + */ def decode(sentences: Array[Array[Int]]): Seq[String] = { sentences.map(s => bpeTokenizer.decodeTokens(s.map(_.toInt))) } + /** + * Encode a sequence of sentences + * @param sentences Sequence of sentences + * @param task Task + * @return Sequence of encoded sentences + */ def encode(sentences: Seq[Annotation], task: String): Seq[Array[Int]] = { SentenceSplit .unpack(sentences) @@ -306,6 +317,17 @@ private[johnsnowlabs] class Bart( }) } + + /** + * Get model output for a batch of input sequences + * @param encoderInputIds input ids + * @param decoderInputIds decoder input ids + * @param decoderEncoderStateTensors encoder state + * @param encoderAttentionMaskTensors attention mask + * @param maxLength max length + * @param session tensorflow session + * @return model output + */ override def getModelOutput( encoderInputIds: Seq[Array[Int]], decoderInputIds: Seq[Array[Int]], diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala b/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala index 40c49d94dc3f51..ef1328233454e2 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala @@ -320,6 +320,17 @@ trait Generate { values.map(x => (x - c - logSumExp).toFloat) } + /** Reshapes a 1D array into a 2D array with the specified number of rows and columns. 
+ * + * @param inputArray + * The input array to reshape + * @param numRows + * The number of rows in the output array + * @param numCols + * The number of columns in the output array + * @return + * The reshaped array + */ def reshapeArray( inputArray: Array[Array[Float]], numRows: Int, @@ -342,6 +353,17 @@ trait Generate { outputArray // Return the reshaped array } + /** Samples from a multinomial distribution using the provided logits. + * + * @param logitValues + * The logits to sample from + * @param k + * The number of samples to draw + * @param seed + * The random seed to use + * @return + * The sampled indices + */ def multinomialSampling(logitValues: Array[Float], k: Int, seed: Option[Long]): Array[Int] = { val (distFiltered, indices) = logitValues.zipWithIndex.filter { case (elem, index) => !elem.isInfinite }.sorted.unzip @@ -351,7 +373,6 @@ trait Generate { val sumExpLogitValues = expLogitValues.sum val probabilities = expLogitValues.map(_ / sumExpLogitValues) -// val indices = Array.range(0, logitValues.length) val selectedIndices = new Array[Int](k) var seededRandom = new scala.util.Random() if (seed.isDefined) { @@ -387,6 +408,17 @@ trait Generate { maxLength: Int, session: Session): Array[Array[Float]] + /** Samples from a multinomial distribution using the provided logits. + * + * @param logits + * The logits to sample from + * @param k + * The number of samples to draw + * @param seed + * The random seed to use + * @return + * The sampled indices + */ def sample(logits: Seq[Float], k: Int, seed: Long = 42): Array[Int] = { val maxLogit = logits.max val logitsExp = logits.map(logit => math.exp(logit - maxLogit)) From b38cd21c81ac900815c0cde206e7dfcab55ad402 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Tue, 9 May 2023 11:31:41 +0000 Subject: [PATCH 22/32] Added a comments and method parameters --- .../scala/com/johnsnowlabs/ml/ai/Bart.scala | 135 +++++++++++------- 1 file changed, 85 insertions(+), 50 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala index 9e230e941a9783..e0c5d771c2b3a0 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala @@ -24,7 +24,7 @@ import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BartTokenizer, BpeTokenize import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} import org.tensorflow.{Session, Tensor} -import scala.collection.JavaConverters.* +import scala.collection.JavaConverters._ /** This class is used to run Bart model for For Sequence Batches of WordpieceTokenizedSentence. 
* Input for this model must be tokenized with a SentencePieceModel, @@ -78,20 +78,34 @@ private[johnsnowlabs] class Bart( private var nextStateTensor1: Option[org.tensorflow.Tensor] = None private var nextStateTensor2: Option[org.tensorflow.Tensor] = None - /** @param sentences Sequence of WordpieceTokenizedSentence - * @param batchSize Batch size - * @param minOutputLength Minimum length of output - * @param maxOutputLength Maximum length of output - * @param doSample Whether to sample or not - * @param temperature Temperature for sampling - * @param topK Top K for sampling - * @param topP Top P for sampling - * @param repetitionPenalty Repetition penalty for sampling - * @param noRepeatNgramSize No repeat ngram size for sampling - * @param task Task - * @param randomSeed Random seed - * @param ignoreTokenIds Ignore token ids - * @param beamSize Beam size + /** @param sentences + * Sequence of WordpieceTokenizedSentence + * @param batchSize + * Batch size + * @param minOutputLength + * Minimum length of output + * @param maxOutputLength + * Maximum length of output + * @param doSample + * Whether to sample or not + * @param temperature + * Temperature for sampling + * @param topK + * Top K for sampling + * @param topP + * Top P for sampling + * @param repetitionPenalty + * Repetition penalty for sampling + * @param noRepeatNgramSize + * No repeat ngram size for sampling + * @param task + * Task + * @param randomSeed + * Random seed + * @param ignoreTokenIds + * Ignore token ids + * @param beamSize + * Beam size * @return */ def predict( @@ -144,19 +158,32 @@ private[johnsnowlabs] class Bart( } } - /** @param batch Sequence of WordpieceTokenizedSentence - * @param minOutputLength Minimum length of output - * @param maxOutputLength Maximum length of output - * @param doSample Whether to sample or not - * @param temperature Temperature for sampling - * @param topK Top K for sampling - * @param topP Top P for sampling - * @param repetitionPenalty Repetition penalty for sampling - * @param noRepeatNgramSize No repeat ngram size for sampling - * @param randomSeed Random seed - * @param ignoreTokenIds Ignore token ids - * @param beamSize Beam size - * @return Sequence of WordpieceTokenizedSentence + /** @param batch + * Sequence of WordpieceTokenizedSentence + * @param minOutputLength + * Minimum length of output + * @param maxOutputLength + * Maximum length of output + * @param doSample + * Whether to sample or not + * @param temperature + * Temperature for sampling + * @param topK + * Top K for sampling + * @param topP + * Top P for sampling + * @param repetitionPenalty + * Repetition penalty for sampling + * @param noRepeatNgramSize + * No repeat ngram size for sampling + * @param randomSeed + * Random seed + * @param ignoreTokenIds + * Ignore token ids + * @param beamSize + * Beam size + * @return + * Sequence of WordpieceTokenizedSentence */ def tag( batch: Seq[Array[Int]], @@ -288,21 +315,24 @@ private[johnsnowlabs] class Bart( modelOutputs } - /** - * Decode a sequence of sentences - * @param sentences Sequence of sentences - * @return Sequence of decoded sentences - */ + /** Decode a sequence of sentences + * @param sentences + * Sequence of sentences + * @return + * Sequence of decoded sentences + */ def decode(sentences: Array[Array[Int]]): Seq[String] = { sentences.map(s => bpeTokenizer.decodeTokens(s.map(_.toInt))) } - /** - * Encode a sequence of sentences - * @param sentences Sequence of sentences - * @param task Task - * @return Sequence of encoded sentences - */ + /** Encode a 
sequence of sentences + * @param sentences + * Sequence of sentences + * @param task + * Task + * @return + * Sequence of encoded sentences + */ def encode(sentences: Seq[Annotation], task: String): Seq[Array[Int]] = { SentenceSplit .unpack(sentences) @@ -317,17 +347,22 @@ private[johnsnowlabs] class Bart( }) } - - /** - * Get model output for a batch of input sequences - * @param encoderInputIds input ids - * @param decoderInputIds decoder input ids - * @param decoderEncoderStateTensors encoder state - * @param encoderAttentionMaskTensors attention mask - * @param maxLength max length - * @param session tensorflow session - * @return model output - */ + /** Get model output for a batch of input sequences + * @param encoderInputIds + * input ids + * @param decoderInputIds + * decoder input ids + * @param decoderEncoderStateTensors + * encoder state + * @param encoderAttentionMaskTensors + * attention mask + * @param maxLength + * max length + * @param session + * tensorflow session + * @return + * model output + */ override def getModelOutput( encoderInputIds: Seq[Array[Int]], decoderInputIds: Seq[Array[Int]], From e95e2fe179dc465b46412b54b8777379a08cb692 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 10 May 2023 11:58:31 +0200 Subject: [PATCH 23/32] Rollback on hard exception for isProtected - let's not throw hard exception in the minor release - will log this later for clarity of the users --- .../com/johnsnowlabs/nlp/serialization/Feature.scala | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala index 2c915e040ae577..85332fec330ab4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/serialization/Feature.scala @@ -118,13 +118,8 @@ abstract class Feature[Serializable1, Serializable2, TComplete: ClassTag]( } final def setValue(value: Option[Any]): HasFeatures = { - if (isProtected && isSet) - throw new IllegalArgumentException( - "Trying to set a protected parameter, which was already set." + - " The parameter you are trying to set is protected and can only be set once." + - " For a pretrained model, this was done during the initialization process." 
+ - " If you are trying to train your own model, please check the documentation.") - + // TODO: make sure we log if there is any protected param is being set + // if (isProtected && isSet) if (useBroadcast) { if (isSet) broadcastValue.get.destroy() broadcastValue = From 646040a12919bba7ac0955895593e10125e727d8 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 10 May 2023 13:06:52 +0200 Subject: [PATCH 24/32] Update coding styles [skip test] --- python/sparknlp/annotator/classifier_dl/__init__.py | 2 +- .../roberta_bert_for_zero_shot_classification.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py index ce4a94069d3145..816d0328ecf55b 100644 --- a/python/sparknlp/annotator/classifier_dl/__init__.py +++ b/python/sparknlp/annotator/classifier_dl/__init__.py @@ -45,4 +45,4 @@ from sparknlp.annotator.classifier_dl.camembert_for_question_answering import * from sparknlp.annotator.classifier_dl.bert_for_zero_shot_classification import * from sparknlp.annotator.classifier_dl.distil_bert_for_zero_shot_classification import * -from sparknlp.annotator.classifier_dl.roberta_bert_for_zero_shot_classification import * \ No newline at end of file +from sparknlp.annotator.classifier_dl.roberta_bert_for_zero_shot_classification import * diff --git a/python/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.py b/python/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.py index d48d37992ac583..d8ae8d285bbd59 100644 --- a/python/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.py +++ b/python/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.py @@ -17,11 +17,11 @@ class RoBertaForZeroShotClassification(AnnotatorModel, - HasCaseSensitiveProperties, - HasBatchedAnnotate, - HasClassifierActivationProperties, - HasCandidateLabelsProperties, - HasEngine): + HasCaseSensitiveProperties, + HasBatchedAnnotate, + HasClassifierActivationProperties, + HasCandidateLabelsProperties, + HasEngine): """RoBertaForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language inference) tasks. Equivalent of `RoBertaForSequenceClassification` models, but these models don't require a hardcoded number of potential classes, they can be chosen at runtime. 
It usually means it's slower but it is much more From c262df11de9353ce14b51333f3098acc73553aec Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 10 May 2023 13:54:36 +0000 Subject: [PATCH 25/32] Fix the unit test --- .../scala/com/johnsnowlabs/nlp/HasFeaturesTestSpec.scala | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/test/scala/com/johnsnowlabs/nlp/HasFeaturesTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/HasFeaturesTestSpec.scala index 6a154bf9a350a6..9e826e37c3f552 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/HasFeaturesTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/HasFeaturesTestSpec.scala @@ -21,9 +21,12 @@ class HasFeaturesTestSpec extends AnyFlatSpec { model.setProtectedMockFeature("first") assert(model.getProtectedMockFeature == "first") - assertThrows[IllegalArgumentException] { - model.setProtectedMockFeature("second") - } +// assertThrows[IllegalArgumentException] { +// model.setProtectedMockFeature("second") +// } + model.setProtectedMockFeature("second") + // should stay the same as the first value + assert(model.getProtectedMockFeature == "first") } } From 8697f454a62cb078067827a27900591df25baf4a Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 10 May 2023 16:22:14 +0200 Subject: [PATCH 26/32] Bump to 4.4.2 [run doc] --- README.md | 149 +++++++++--------- build.sbt | 2 +- docs/en/concepts.md | 2 +- docs/en/examples.md | 4 +- docs/en/hardware_acceleration.md | 2 +- docs/en/install.md | 58 +++---- docs/en/spark_nlp.md | 2 +- examples/docker/README.md | 4 +- .../Text_Summarization_with_BART.ipynb | 2 +- examples/util/Training_Helpers.ipynb | 4 +- python/README.md | 149 +++++++++--------- python/docs/conf.py | 2 +- python/setup.py | 2 +- python/sparknlp/__init__.py | 4 +- scripts/colab_setup.sh | 2 +- scripts/kaggle_setup.sh | 2 +- scripts/sagemaker_setup.sh | 2 +- .../scala/com/johnsnowlabs/nlp/SparkNLP.scala | 2 +- .../scala/com/johnsnowlabs/util/Build.scala | 2 +- 19 files changed, 199 insertions(+), 197 deletions(-) diff --git a/README.md b/README.md index d8e5c09c2707e2..4ec5fc96710c67 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,7 @@ documentation and examples - Longformer for Question Answering - Table Question Answering (TAPAS) - Zero-Shot NER Model -- Zero Shot Text Classification by BERT (ZSL) +- Zero Shot Text Classification by Transformers (ZSL) - Neural Machine Translation (MarianMT) - Text-To-Text Transfer Transformer (Google T5) - Generative Pre-trained Transformer 2 (OpenAI GPT2) @@ -165,7 +165,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 4.4.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 4.4.2 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -181,7 +181,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==4.4.1 pyspark==3.3.1 +$ pip install spark-nlp==4.4.2 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -226,22 +226,21 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *4.4.1* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, and -3.3.x: - -| Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache 
Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | -|-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| -| 4.4.x | NO | NO | YES | YES | YES | YES | -| 4.3.x | NO | NO | YES | YES | YES | YES | -| 4.2.x | NO | NO | YES | YES | YES | YES | -| 4.1.x | NO | NO | YES | YES | YES | YES | -| 4.0.x | NO | NO | YES | YES | YES | YES | -| 3.4.x | YES | YES | YES | YES | Partially | N/A | -| 3.3.x | YES | YES | YES | YES | NO | NO | -| 3.2.x | YES | YES | YES | YES | NO | NO | -| 3.1.x | YES | YES | YES | YES | NO | NO | -| 3.0.x | YES | YES | YES | YES | NO | NO | -| 2.7.x | YES | YES | NO | NO | NO | NO | +Spark NLP *4.4.2* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x + +| Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | +|-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| +| 4.4.x | NO | NO | YES | YES | YES | YES | YES | +| 4.3.x | NO | NO | YES | YES | YES | YES | NO | +| 4.2.x | NO | NO | YES | YES | YES | YES | NO | +| 4.1.x | NO | NO | YES | YES | YES | YES | NO | +| 4.0.x | NO | NO | YES | YES | YES | YES | NO | +| 3.4.x | YES | YES | YES | YES | Partially | N/A | NO +| 3.3.x | YES | YES | YES | YES | NO | NO | NO | +| 3.2.x | YES | YES | YES | YES | NO | NO | NO | +| 3.1.x | YES | YES | YES | YES | NO | NO | NO | +| 3.0.x | YES | YES | YES | YES | NO | NO | NO | +| 2.7.x | YES | YES | NO | NO | NO | NO | NO | NOTE: Starting 4.0.0 release, the default `spark-nlp` and `spark-nlp-gpu` packages are based on Scala 2.12.15 and Apache Spark 3.2 by default. @@ -266,7 +265,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 4.4.1 has been tested and is compatible with the following runtimes: +Spark NLP 4.4.2 has been tested and is compatible with the following runtimes: **CPU:** @@ -298,6 +297,8 @@ Spark NLP 4.4.1 has been tested and is compatible with the following runtimes: - 12.1 ML - 12.2 - 12.2 ML +- 13.0 +- 13.0 ML **GPU:** @@ -314,13 +315,14 @@ Spark NLP 4.4.1 has been tested and is compatible with the following runtimes: - 12.0 ML & GPU - 12.1 ML & GPU - 12.2 ML & GPU +- 13.0 ML & GPU NOTE: Spark NLP 4.x is based on TensorFlow 2.7.x which is compatible with CUDA11 and cuDNN 8.0.2. The only Databricks runtimes supporting CUDA 11 are 9.x and above as listed under GPU. ## EMR Support -Spark NLP 4.4.1 has been tested and is compatible with the following EMR releases: +Spark NLP 4.4.2 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -343,10 +345,10 @@ NOTE: The EMR 6.1.0 and 6.1.1 are not supported. 
This is a cheatsheet for corresponding Spark NLP Maven package to Apache Spark / PySpark major version: -| Apache Spark | Spark NLP on CPU | Spark NLP on GPU | Spark NLP on AArch64 (linux) | Spark NLP on Apple Silicon | -|-----------------|--------------------|----------------------------|--------------------------------|--------------------------------------| -| 3.0/3.1/3.2/3.3 | `spark-nlp` | `spark-nlp-gpu` | `spark-nlp-aarch64` | `spark-nlp-silicon` | -| Start Function | `sparknlp.start()` | `sparknlp.start(gpu=True)` | `sparknlp.start(aarch64=True)` | `sparknlp.start(apple_silicon=True)` | +| Apache Spark | Spark NLP on CPU | Spark NLP on GPU | Spark NLP on AArch64 (linux) | Spark NLP on Apple Silicon | +|---------------------|--------------------|----------------------------|--------------------------------|--------------------------------------| +| 3.0/3.1/3.2/3.3/3.4 | `spark-nlp` | `spark-nlp-gpu` | `spark-nlp-aarch64` | `spark-nlp-silicon` | +| Start Function | `sparknlp.start()` | `sparknlp.start(gpu=True)` | `sparknlp.start(aarch64=True)` | `sparknlp.start(apple_silicon=True)` | NOTE: `M1/M2` and `AArch64` are under `experimental` support. Access and support to these architectures are limited by the community and we had to build most of the dependencies by ourselves to make them compatible. We support these two @@ -356,19 +358,18 @@ architectures, however, they may not work in some environments. ### Command line (requires internet connection) -Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, Apache Spark 3.2.x, and Apache Spark -3.3.x. +Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, Apache Spark 3.2.x, Apache Spark 3.3.x, and Apache Spark 3.4.x -#### Apache Spark 3.x (3.0.x, 3.1.x, 3.2.x, and 3.3.x - Scala 2.12) +#### Apache Spark 3.x (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x - Scala 2.12) ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` The `spark-nlp` has been published to @@ -377,11 +378,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.2 ``` @@ -391,11 +392,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.2 ``` @@ -405,11 +406,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 
(Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.2 ``` @@ -423,25 +424,25 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` ## Scala -Spark NLP supports Scala 2.12.15 if you are using Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x versions. Our packages are +Spark NLP supports Scala 2.12.15 if you are using Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x versions. Our packages are deployed to Maven central. To add any of our packages as a dependency in your application you can follow these coordinates: ### Maven -**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x: +**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x: ```xml com.johnsnowlabs.nlp spark-nlp_2.12 - 4.4.1 + 4.4.2 ``` @@ -452,7 +453,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 4.4.1 + 4.4.2 ``` @@ -463,7 +464,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 4.4.1 + 4.4.2 ``` @@ -474,38 +475,38 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 4.4.1 + 4.4.2 ``` ### SBT -**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x: +**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "4.4.2" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "4.4.2" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "4.4.2" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.2" ``` Maven @@ -527,7 +528,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==4.4.1 +pip install spark-nlp==4.4.2 ``` Conda: @@ -556,7 +557,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2") .getOrCreate() ``` @@ -588,19 +589,19 @@ result = pipeline.annotate('The Mona Lisa is a 16th century oil painting created #### spark-nlp -- FAT-JAR for CPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x +- FAT-JAR for CPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, 
3.3.x, and 3.4.x ```bash sbt assembly ``` -- FAT-JAR for GPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x +- FAT-JAR for GPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x ```bash sbt -Dis_gpu=true assembly ``` -- FAT-JAR for M! on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x +- FAT-JAR for M! on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x ```bash sbt -Dis_silicon=true assembly @@ -627,7 +628,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -638,7 +639,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==4.4.1 +pip install spark-nlp==4.4.2 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -666,7 +667,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==4.4.1 pyspark==3.3.1 jupyter +$ pip install spark-nlp==4.4.2 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -683,7 +684,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -710,7 +711,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.2 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -733,7 +734,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.2 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -752,9 +753,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==4.4.1` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==4.4.2` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
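For a quick sanity check from a notebook attached to the cluster, a minimal sketch (it assumes the cluster can reach the internet to download a pretrained pipeline; the pipeline name and sample sentence below are only illustrative):

```python
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

# On Databricks the SparkSession already exists, so there is no need to call sparknlp.start()
print("Spark NLP version:", sparknlp.version())

# Download an example pretrained pipeline and annotate a sample sentence
pipeline = PretrainedPipeline("explain_document_dl", lang="en")
result = pipeline.annotate("John Snow Labs is a company based in Delaware.")
print(result["entities"])
```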
@@ -805,7 +806,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2" } }] ``` @@ -814,7 +815,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 4.4.1" \ +--name "Spark NLP 4.4.2" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -878,7 +879,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. @@ -917,7 +918,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2") .getOrCreate() ``` @@ -931,7 +932,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` **pyspark:** @@ -944,7 +945,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` **Databricks:** @@ -1216,16 +1217,16 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-4.4.1.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-4.4.2.jar") .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark - version (3.0.x, 3.1.x, 3.2.x, and 3.3.x) + version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-4.4.1.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-4.4.2.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/build.sbt b/build.sbt index 21e8ce870e1186..e2d7f4dd78690b 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "4.4.1" +version := "4.4.2" (ThisBuild / scalaVersion) := scalaVer diff --git a/docs/en/concepts.md b/docs/en/concepts.md index b06559e9b6fbdd..b363d0f9cab795 100644 --- a/docs/en/concepts.md +++ b/docs/en/concepts.md @@ -62,7 +62,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==4.4.1 pyspark==3.3.1 jupyter +$ pip install spark-nlp==4.4.2 pyspark==3.3.1 jupyter $ jupyter notebook ``` diff --git a/docs/en/examples.md b/docs/en/examples.md index b0416da43b4037..0ba39aa0daf615 100644 --- a/docs/en/examples.md +++ b/docs/en/examples.md @@ -16,7 +16,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp -$ pip install spark-nlp==4.4.1 pyspark==3.3.1 +$ pip install spark-nlp==4.4.2 pyspark==3.3.1 ``` ## Google Colab Notebook @@ -36,7 +36,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!bash colab.sh -p 3.2.3 -s 4.4.1 +!bash colab.sh -p 3.2.3 -s 4.4.2 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. 
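After the setup script finishes, a minimal sketch of a first notebook cell (the example sentence and column names are only illustrative):

```python
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from pyspark.ml import Pipeline

# Start a Spark session preconfigured for Spark NLP (use sparknlp.start(gpu=True) on a GPU runtime)
spark = sparknlp.start()
print("Spark NLP version:", sparknlp.version())

# Build a tiny pipeline: raw text -> document -> tokens
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
pipeline = Pipeline(stages=[document_assembler, tokenizer])

data = spark.createDataFrame([["Spark NLP runs on top of Apache Spark."]]).toDF("text")
pipeline.fit(data).transform(data).select("token.result").show(truncate=False)
```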
diff --git a/docs/en/hardware_acceleration.md b/docs/en/hardware_acceleration.md index 8dc66410972013..612bb67bc63123 100644 --- a/docs/en/hardware_acceleration.md +++ b/docs/en/hardware_acceleration.md @@ -49,7 +49,7 @@ Since the new Transformer models such as BERT for Word and Sentence embeddings a | DeBERTa Large | +477%(5.8x) | | Longformer Base | +52%(1.5x) | -Spark NLP 4.4.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 4.4.2 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/docs/en/install.md b/docs/en/install.md index 3070b01fddc5dc..b539d71aefbfec 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -15,22 +15,22 @@ sidebar: ```bash # Install Spark NLP from PyPI -pip install spark-nlp==4.4.1 +pip install spark-nlp==4.4.2 # Install Spark NLP from Anacodna/Conda conda install -c johnsnowlabs spark-nlp # Load Spark NLP with Spark Shell -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 # Load Spark NLP with PySpark -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 # Load Spark NLP with Spark Submit -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 # Load Spark NLP as external JAR after compiling and building Spark NLP by `sbt assembly` -spark-shell --jars spark-nlp-assembly-4.4.1.jar +spark-shell --jars spark-nlp-assembly-4.4.2.jar ``` ## Python @@ -49,7 +49,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp -$ pip install spark-nlp==4.4.1 pyspark==3.3.1 +$ pip install spark-nlp==4.4.2 pyspark==3.3.1 ``` Of course you will need to have jupyter installed in your system: @@ -76,7 +76,7 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1")\ + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2")\ .getOrCreate() ``` @@ -84,14 +84,14 @@ spark = SparkSession.builder \ #### Maven -**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x: +**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x ```xml com.johnsnowlabs.nlp spark-nlp_2.12 - 4.4.1 + 4.4.2 ``` @@ -102,7 +102,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 4.4.1 + 4.4.2 ``` @@ -113,7 +113,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 4.4.1 + 4.4.2 ``` @@ -124,38 +124,38 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 4.4.1 + 4.4.2 ``` #### SBT -**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x: +**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "4.4.2" ``` **spark-nlp-gpu:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "4.4.1" +libraryDependencies += 
"com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "4.4.2" ``` **spark-nlp-silicon:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.2" ``` **spark-nlp-aarch64:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "4.4.2" ``` Maven Central: [https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) @@ -233,7 +233,7 @@ maven coordinates like these: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 4.4.1 + 4.4.2 ``` @@ -241,7 +241,7 @@ or in case of sbt: ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.2" ``` If everything went well, you can now start Spark NLP with the `m1` flag set to `true`: @@ -274,7 +274,7 @@ spark = sparknlp.start(apple_silicon=True) ## Installation for Linux Aarch64 Systems -Starting from version 4.4.1, Spark NLP supports Linux systems running on an aarch64 +Starting from version 4.4.2, Spark NLP supports Linux systems running on an aarch64 processor architecture. The necessary dependencies have been built on Ubuntu 16.04, so a recent system with an environment of at least that will be needed. @@ -318,7 +318,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.1 +!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.2 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. @@ -337,7 +337,7 @@ Run the following code in Kaggle Kernel and start using spark-nlp right away. ## Databricks Support -Spark NLP 4.4.1 has been tested and is compatible with the following runtimes: +Spark NLP 4.4.2 has been tested and is compatible with the following runtimes: **CPU:** @@ -403,7 +403,7 @@ NOTE: Spark NLP 4.0.x is based on TensorFlow 2.7.x which is compatible with CUDA 3.1. Install New -> PyPI -> `spark-nlp` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -419,7 +419,7 @@ Note: You can import these notebooks by using their URLs. 
## EMR Support -Spark NLP 4.4.1 has been tested and is compatible with the following EMR releases: +Spark NLP 4.4.2 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -477,7 +477,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2" } } ] @@ -487,7 +487,7 @@ A sample of AWS CLI to launch EMR cluster: ```sh aws emr create-cluster \ ---name "Spark NLP 4.4.1" \ +--name "Spark NLP 4.4.2" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -741,7 +741,7 @@ We recommend using `conda` to manage your Python environment on Windows. Now you can use the downloaded binary by navigating to `%SPARK_HOME%\bin` and running -Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1*. +Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2*. @@ -767,12 +767,12 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars", "/tmp/spark-nlp-assembly-4.4.1.jar")\ + .config("spark.jars", "/tmp/spark-nlp-assembly-4.4.2.jar")\ .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark version (3.x) -- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-4.4.1.jar`) +- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-4.4.2.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/docs/en/spark_nlp.md b/docs/en/spark_nlp.md index 84ddf4a04a60c7..a98647a9a591d6 100644 --- a/docs/en/spark_nlp.md +++ b/docs/en/spark_nlp.md @@ -25,7 +25,7 @@ Spark NLP is built on top of **Apache Spark 3.x**. 
For using Spark NLP you need: **GPU (optional):** -Spark NLP 4.4.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 4.4.2 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/examples/docker/README.md b/examples/docker/README.md index 2213e7ae1042fb..16fd262a2d98f3 100644 --- a/examples/docker/README.md +++ b/examples/docker/README.md @@ -73,7 +73,7 @@ docker run -it --name sparknlp-container \ --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ --conf "spark.kryoserializer.buffer.max"="2000M" \ --conf "spark.driver.maxResultSize"="0" \ - --packages "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1" + --packages "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2" ``` To run the shell with GPU support, we use the image from [Jupyter Notebook with GPU @@ -91,5 +91,5 @@ docker run -it --name sparknlp-container \ --conf "spark.serializer"="org.apache.spark.serializer.KryoSerializer" \ --conf "spark.kryoserializer.buffer.max"="2000M" \ --conf "spark.driver.maxResultSize"="0" \ - --packages "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.1" + --packages "com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.2" ``` diff --git a/examples/python/annotation/text/english/text-summarization/Text_Summarization_with_BART.ipynb b/examples/python/annotation/text/english/text-summarization/Text_Summarization_with_BART.ipynb index aa60115365bd94..495fe7b0d3facf 100644 --- a/examples/python/annotation/text/english/text-summarization/Text_Summarization_with_BART.ipynb +++ b/examples/python/annotation/text/english/text-summarization/Text_Summarization_with_BART.ipynb @@ -52,7 +52,7 @@ }, "outputs": [], "source": [ - "!pip install -q pyspark==3.3.2 spark-nlp==4.4.1" + "!pip install -q pyspark==3.3.2 spark-nlp==4.4.2" ] }, { diff --git a/examples/util/Training_Helpers.ipynb b/examples/util/Training_Helpers.ipynb index 3cc03a8a7b7d2c..5f2e7fef429e8a 100644 --- a/examples/util/Training_Helpers.ipynb +++ b/examples/util/Training_Helpers.ipynb @@ -129,7 +129,7 @@ "id": "Jn3axMFZTaxV" }, "source": [ - "Starting at spark-nlp 4.4.1, you can also set an S3 URI. To configure this, it is necessary to set up the Spark session with the appropriate settings for both Spark NLP and Spark ML." + "Starting at spark-nlp 4.4.2, you can also set an S3 URI. To configure this, it is necessary to set up the Spark session with the appropriate settings for both Spark NLP and Spark ML." 
] }, { @@ -246,7 +246,7 @@ " .config(\"spark.jsl.settings.aws.credentials.secret_access_key\", MY_SECRET_KEY) \\\n", " .config(\"spark.jsl.settings.aws.credentials.session_token\", MY_SESSION_KEY) \\\n", " .config(\"spark.jsl.settings.aws.region\", \"us-east-1\") \\\n", - " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1,org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk:1.11.901\") \\\n", + " .config(\"spark.jars.packages\", \"com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2,org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk:1.11.901\") \\\n", " .config(\"spark.hadoop.fs.s3a.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\") \\\n", " .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\") \\\n", " .getOrCreate()" diff --git a/python/README.md b/python/README.md index d8e5c09c2707e2..4ec5fc96710c67 100644 --- a/python/README.md +++ b/python/README.md @@ -137,7 +137,7 @@ documentation and examples - Longformer for Question Answering - Table Question Answering (TAPAS) - Zero-Shot NER Model -- Zero Shot Text Classification by BERT (ZSL) +- Zero Shot Text Classification by Transformers (ZSL) - Neural Machine Translation (MarianMT) - Text-To-Text Transfer Transformer (Google T5) - Generative Pre-trained Transformer 2 (OpenAI GPT2) @@ -165,7 +165,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 4.4.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 4.4.2 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -181,7 +181,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==4.4.1 pyspark==3.3.1 +$ pip install spark-nlp==4.4.2 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -226,22 +226,21 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *4.4.1* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, and -3.3.x: - -| Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | -|-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| -| 4.4.x | NO | NO | YES | YES | YES | YES | -| 4.3.x | NO | NO | YES | YES | YES | YES | -| 4.2.x | NO | NO | YES | YES | YES | YES | -| 4.1.x | NO | NO | YES | YES | YES | YES | -| 4.0.x | NO | NO | YES | YES | YES | YES | -| 3.4.x | YES | YES | YES | YES | Partially | N/A | -| 3.3.x | YES | YES | YES | YES | NO | NO | -| 3.2.x | YES | YES | YES | YES | NO | NO | -| 3.1.x | YES | YES | YES | YES | NO | NO | -| 3.0.x | YES | YES | YES | YES | NO | NO | -| 2.7.x | YES | YES | NO | NO | NO | NO | +Spark NLP *4.4.2* has been built on top of Apache Spark 3.2 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x + +| Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | +|-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| +| 4.4.x | NO | NO | YES | YES | YES | YES | YES | +| 
4.3.x | NO | NO | YES | YES | YES | YES | NO | +| 4.2.x | NO | NO | YES | YES | YES | YES | NO | +| 4.1.x | NO | NO | YES | YES | YES | YES | NO | +| 4.0.x | NO | NO | YES | YES | YES | YES | NO | +| 3.4.x | YES | YES | YES | YES | Partially | N/A | NO +| 3.3.x | YES | YES | YES | YES | NO | NO | NO | +| 3.2.x | YES | YES | YES | YES | NO | NO | NO | +| 3.1.x | YES | YES | YES | YES | NO | NO | NO | +| 3.0.x | YES | YES | YES | YES | NO | NO | NO | +| 2.7.x | YES | YES | NO | NO | NO | NO | NO | NOTE: Starting 4.0.0 release, the default `spark-nlp` and `spark-nlp-gpu` packages are based on Scala 2.12.15 and Apache Spark 3.2 by default. @@ -266,7 +265,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 4.4.1 has been tested and is compatible with the following runtimes: +Spark NLP 4.4.2 has been tested and is compatible with the following runtimes: **CPU:** @@ -298,6 +297,8 @@ Spark NLP 4.4.1 has been tested and is compatible with the following runtimes: - 12.1 ML - 12.2 - 12.2 ML +- 13.0 +- 13.0 ML **GPU:** @@ -314,13 +315,14 @@ Spark NLP 4.4.1 has been tested and is compatible with the following runtimes: - 12.0 ML & GPU - 12.1 ML & GPU - 12.2 ML & GPU +- 13.0 ML & GPU NOTE: Spark NLP 4.x is based on TensorFlow 2.7.x which is compatible with CUDA11 and cuDNN 8.0.2. The only Databricks runtimes supporting CUDA 11 are 9.x and above as listed under GPU. ## EMR Support -Spark NLP 4.4.1 has been tested and is compatible with the following EMR releases: +Spark NLP 4.4.2 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -343,10 +345,10 @@ NOTE: The EMR 6.1.0 and 6.1.1 are not supported. This is a cheatsheet for corresponding Spark NLP Maven package to Apache Spark / PySpark major version: -| Apache Spark | Spark NLP on CPU | Spark NLP on GPU | Spark NLP on AArch64 (linux) | Spark NLP on Apple Silicon | -|-----------------|--------------------|----------------------------|--------------------------------|--------------------------------------| -| 3.0/3.1/3.2/3.3 | `spark-nlp` | `spark-nlp-gpu` | `spark-nlp-aarch64` | `spark-nlp-silicon` | -| Start Function | `sparknlp.start()` | `sparknlp.start(gpu=True)` | `sparknlp.start(aarch64=True)` | `sparknlp.start(apple_silicon=True)` | +| Apache Spark | Spark NLP on CPU | Spark NLP on GPU | Spark NLP on AArch64 (linux) | Spark NLP on Apple Silicon | +|---------------------|--------------------|----------------------------|--------------------------------|--------------------------------------| +| 3.0/3.1/3.2/3.3/3.4 | `spark-nlp` | `spark-nlp-gpu` | `spark-nlp-aarch64` | `spark-nlp-silicon` | +| Start Function | `sparknlp.start()` | `sparknlp.start(gpu=True)` | `sparknlp.start(aarch64=True)` | `sparknlp.start(apple_silicon=True)` | NOTE: `M1/M2` and `AArch64` are under `experimental` support. Access and support to these architectures are limited by the community and we had to build most of the dependencies by ourselves to make them compatible. We support these two @@ -356,19 +358,18 @@ architectures, however, they may not work in some environments. ### Command line (requires internet connection) -Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, Apache Spark 3.2.x, and Apache Spark -3.3.x. 
+Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, Apache Spark 3.2.x, Apache Spark 3.3.x, and Apache Spark 3.4.x -#### Apache Spark 3.x (3.0.x, 3.1.x, 3.2.x, and 3.3.x - Scala 2.12) +#### Apache Spark 3.x (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x - Scala 2.12) ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` The `spark-nlp` has been published to @@ -377,11 +378,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:4.4.2 ``` @@ -391,11 +392,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:4.4.2 ``` @@ -405,11 +406,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:4.4.2 ``` @@ -423,25 +424,25 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` ## Scala -Spark NLP supports Scala 2.12.15 if you are using Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x versions. Our packages are +Spark NLP supports Scala 2.12.15 if you are using Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x versions. Our packages are deployed to Maven central. 
To add any of our packages as a dependency in your application you can follow these coordinates: ### Maven -**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x: +**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x: ```xml com.johnsnowlabs.nlp spark-nlp_2.12 - 4.4.1 + 4.4.2 ``` @@ -452,7 +453,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 4.4.1 + 4.4.2 ``` @@ -463,7 +464,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 4.4.1 + 4.4.2 ``` @@ -474,38 +475,38 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 4.4.1 + 4.4.2 ``` ### SBT -**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x: +**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "4.4.2" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "4.4.2" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "4.4.2" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "4.4.2" ``` Maven @@ -527,7 +528,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==4.4.1 +pip install spark-nlp==4.4.2 ``` Conda: @@ -556,7 +557,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2") .getOrCreate() ``` @@ -588,19 +589,19 @@ result = pipeline.annotate('The Mona Lisa is a 16th century oil painting created #### spark-nlp -- FAT-JAR for CPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x +- FAT-JAR for CPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x ```bash sbt assembly ``` -- FAT-JAR for GPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x +- FAT-JAR for GPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x ```bash sbt -Dis_gpu=true assembly ``` -- FAT-JAR for M! on Apache Spark 3.0.x, 3.1.x, 3.2.x, and 3.3.x +- FAT-JAR for M! 
on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x ```bash sbt -Dis_silicon=true assembly @@ -627,7 +628,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -638,7 +639,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==4.4.1 +pip install spark-nlp==4.4.2 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -666,7 +667,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==4.4.1 pyspark==3.3.1 jupyter +$ pip install spark-nlp==4.4.2 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -683,7 +684,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -710,7 +711,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.2 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -733,7 +734,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 4.4.2 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -752,9 +753,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==4.4.1` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==4.4.2` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
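As a quick sanity check once the notebook is attached (a minimal sketch, assuming the PyPI and Maven libraries above have finished installing on that cluster and that the notebook's built-in `spark` session is used instead of `sparknlp.start()`):

```python
# Run in a Databricks notebook cell; `spark` is already provided by the cluster.
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer
from pyspark.ml import Pipeline

print(sparknlp.version())  # expect 4.4.2 if the libraries above were installed

data = spark.createDataFrame([["Spark NLP is ready on this cluster."]]).toDF("text")

# Minimal pipeline: raw text -> document -> tokens
document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")

model = Pipeline(stages=[document_assembler, tokenizer]).fit(data)
model.transform(data).select("token.result").show(truncate=False)
```

If the import or the pipeline fails, double-check that both the PyPI package and the Maven coordinate above were installed on the same cluster before attaching the notebook.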
@@ -805,7 +806,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2" } }] ``` @@ -814,7 +815,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 4.4.1" \ +--name "Spark NLP 4.4.2" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -878,7 +879,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. @@ -917,7 +918,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2") .getOrCreate() ``` @@ -931,7 +932,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` **pyspark:** @@ -944,7 +945,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:4.4.2 ``` **Databricks:** @@ -1216,16 +1217,16 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-4.4.1.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-4.4.2.jar") .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark - version (3.0.x, 3.1.x, 3.2.x, and 3.3.x) + version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-4.4.1.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-4.4.2.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/python/docs/conf.py b/python/docs/conf.py index fbe20d67238696..8e70da87cb682f 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,7 +23,7 @@ author = "John Snow Labs" # The full version, including alpha/beta/rc tags -release = "4.4.1" +release = "4.4.2" pyspark_version = "3.2.3" # -- General configuration --------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index 8ab8ff23534840..1bb7fb062d8452 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='4.4.1', # Required + version='4.4.2', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index bc299f860e47b4..31401e31c7744e 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -128,7 +128,7 @@ def start(gpu=False, The initiated Spark session. """ - current_version = "4.4.1" + current_version = "4.4.2" if params is None: params = {} @@ -298,4 +298,4 @@ def version(): str The current Spark NLP version. """ - return '4.4.1' + return '4.4.2' diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index 0040d36711dd17..28b96ea4e5a3a2 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="4.4.1" +SPARKNLP="4.4.2" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index 79b290eabcec83..4d39e62b36a089 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="4.4.1" +SPARKNLP="4.4.2" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/sagemaker_setup.sh b/scripts/sagemaker_setup.sh index d2431ae1b308d5..44c3c557bbcbb2 100644 --- a/scripts/sagemaker_setup.sh +++ b/scripts/sagemaker_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # Default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="4.4.1" +SPARKNLP="4.4.2" PYSPARK="3.2.3" echo "Setup SageMaker for PySpark $PYSPARK and Spark NLP $SPARKNLP" diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index aa7a3fa16cf88d..ab1834599276bd 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.SparkSession object SparkNLP { - val currentVersion = "4.4.1" + val currentVersion = "4.4.2" val MavenSpark3 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" val MavenGpuSpark3 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" val MavenSparkSilicon = s"com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:$currentVersion" diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala index 6d7725d27b685a..64845ab88fdaa9 100644 --- a/src/main/scala/com/johnsnowlabs/util/Build.scala +++ b/src/main/scala/com/johnsnowlabs/util/Build.scala @@ -17,5 +17,5 @@ package com.johnsnowlabs.util object Build { - val version: String = "4.4.1" + val version: String = "4.4.2" } From 
47e2f3db40b288bd3114d7ac7fc6b7a78b7e9765 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 10 May 2023 17:35:10 +0200 Subject: [PATCH 27/32] Disabling spark 30 and 31 GA jobs [run doc] - we are hitting the rate limit on GA - jobs are failing with the new spark 34 tests - the spark 30 and 31 are old now with security issues, we can for now disable them --- .github/workflows/build_and_test.yml | 136 +++++++++++++-------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 07868e9b9ddaf8..a1afb5b4117065 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -144,74 +144,74 @@ jobs: cd python python3.7 -m pytest -v -m fast - spark31: - if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')" - runs-on: macos-latest - env: - TF_CPP_MIN_LOG_LEVEL: 3 - JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" - name: Build and Test on Apache Spark 3.1.x + # spark31: + # if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')" + # runs-on: macos-latest + # env: + # TF_CPP_MIN_LOG_LEVEL: 3 + # JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" + # name: Build and Test on Apache Spark 3.1.x - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-java@v3 - with: - distribution: 'adopt' - java-version: '8' - cache: 'sbt' - - name: Install Python 3.7 - uses: actions/setup-python@v2 - with: - python-version: 3.7.7 - architecture: x64 - - name: Install Python packages (Python 3.7) - run: | - python -m pip install --upgrade pip - pip install pyspark==3.1.3 numpy pytest - - name: Build Spark NLP on Apache Spark 3.1.x - run: | - brew install sbt - sbt -mem 4096 -Dis_spark31=true clean assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 3.1.x - run: | - sbt -mem 4096 test - - name: Test Spark NLP in Python - Apache Spark 3.1.x - run: | - cd python - python3.7 -m pytest -v -m fast + # steps: + # - uses: actions/checkout@v3 + # - uses: actions/setup-java@v3 + # with: + # distribution: 'adopt' + # java-version: '8' + # cache: 'sbt' + # - name: Install Python 3.7 + # uses: actions/setup-python@v2 + # with: + # python-version: 3.7.7 + # architecture: x64 + # - name: Install Python packages (Python 3.7) + # run: | + # python -m pip install --upgrade pip + # pip install pyspark==3.1.3 numpy pytest + # - name: Build Spark NLP on Apache Spark 3.1.x + # run: | + # brew install sbt + # sbt -mem 4096 -Dis_spark31=true clean assemblyAndCopy + # - name: Test Spark NLP in Scala - Apache Spark 3.1.x + # run: | + # sbt -mem 4096 test + # - name: Test Spark NLP in Python - Apache Spark 3.1.x + # run: | + # cd python + # python3.7 -m pytest -v -m fast - spark30: - if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')" - runs-on: macos-latest - env: - TF_CPP_MIN_LOG_LEVEL: 3 - JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" - name: Build and Test on Apache Spark 3.0.x + # spark30: + # if: "! 
contains(toJSON(github.event.commits.*.message), '[skip test]')" + # runs-on: macos-latest + # env: + # TF_CPP_MIN_LOG_LEVEL: 3 + # JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" + # name: Build and Test on Apache Spark 3.0.x - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-java@v3 - with: - distribution: 'adopt' - java-version: '8' - cache: 'sbt' - - name: Install Python 3.7 - uses: actions/setup-python@v2 - with: - python-version: 3.7.7 - architecture: x64 - - name: Install Python packages (Python 3.7) - run: | - python -m pip install --upgrade pip - pip install pyspark==3.0.3 numpy pytest - - name: Build Spark NLP on Apache Spark 3.0.x - run: | - brew install sbt - sbt -mem 4096 -Dis_spark30=true clean assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 3.0.x - run: | - sbt -mem 4096 test - - name: Test Spark NLP in Python - Apache Spark 3.0.x - run: | - cd python - python3.7 -m pytest -v -m fast \ No newline at end of file + # steps: + # - uses: actions/checkout@v3 + # - uses: actions/setup-java@v3 + # with: + # distribution: 'adopt' + # java-version: '8' + # cache: 'sbt' + # - name: Install Python 3.7 + # uses: actions/setup-python@v2 + # with: + # python-version: 3.7.7 + # architecture: x64 + # - name: Install Python packages (Python 3.7) + # run: | + # python -m pip install --upgrade pip + # pip install pyspark==3.0.3 numpy pytest + # - name: Build Spark NLP on Apache Spark 3.0.x + # run: | + # brew install sbt + # sbt -mem 4096 -Dis_spark30=true clean assemblyAndCopy + # - name: Test Spark NLP in Scala - Apache Spark 3.0.x + # run: | + # sbt -mem 4096 test + # - name: Test Spark NLP in Python - Apache Spark 3.0.x + # run: | + # cd python + # python3.7 -m pytest -v -m fast \ No newline at end of file From 7cf1bd2b08bfb5d625fc40854d35e0827c5df3fb Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Thu, 11 May 2023 01:39:50 +1000 Subject: [PATCH 28/32] Updated model input and output signature to use ModelSignatureConstants (#13790) --- .../scala/com/johnsnowlabs/ml/ai/Bart.scala | 104 +++++++++++------- .../sign/ModelSignatureConstants.scala | 75 +++++++++++++ 2 files changed, 140 insertions(+), 39 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala index e0c5d771c2b3a0..cb46d348450483 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala @@ -17,7 +17,7 @@ package com.johnsnowlabs.ml.ai import com.johnsnowlabs.ml.ai.util.Generation.Generate -import com.johnsnowlabs.ml.tensorflow.sign.ModelSignatureManager +import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} import com.johnsnowlabs.nlp.annotators.common.SentenceSplit import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BartTokenizer, BpeTokenizer} @@ -53,27 +53,6 @@ private[johnsnowlabs] class Bart( private val paddingTokenId = 1 private val eosTokenId = 2 private val vocabSize = 50264 - private val encoderInputIdsKey = "encoder_encoder_input_ids:0" - private val encoderAttentionMaskKey = "encoder_encoder_attention_mask:0" - private val encoderOutputKey = "StatefulPartitionedCall_2:0" - - private val decoderInitInputIdsKey = "decoder_init_decoder_input_ids:0" - private val decoderInitEncoderAttentionMaskKey = "decoder_init_encoder_attention_mask:0" - private val decoderInitEncoderStateKey = "decoder_init_encoder_state:0" - - private val decoderInitOutputLogitsKey = 
"StatefulPartitionedCall_1:2" - private val decoderInitOutputCache1Key = "StatefulPartitionedCall_1:0" - private val decoderInitOutputCache2Key = "StatefulPartitionedCall_1:1" - - private val decoderCachedInputIdsKey = "decoder_cached_decoder_input_ids:0" - private val decoderCachedEncoderAttentionMaskKey = "decoder_cached_encoder_attention_mask:0" - private val decoderCachedEncoderStateKey = "decoder_cached_encoder_state:0" - private val decoderCachedCache1Key = "decoder_cached_cache1:0" - private val decoderCachedCache2Key = "decoder_cached_cache2:0" - - private val decoderCachedOutputLogitsKey = "StatefulPartitionedCall:2" - private val decoderCachedOutputCache1Key = "StatefulPartitionedCall:0" - private val decoderCachedOutputCache2Key = "StatefulPartitionedCall:1" var tensorDecoder = new TensorResources() private var nextStateTensor1: Option[org.tensorflow.Tensor] = None private var nextStateTensor2: Option[org.tensorflow.Tensor] = None @@ -250,9 +229,18 @@ private[johnsnowlabs] class Bart( val runner = session.runner runner - .feed(encoderInputIdsKey, encoderInputTensors) - .feed(encoderAttentionMaskKey, encoderAttentionMaskTensors) - .fetch(encoderOutputKey) + .feed( + _tfBartSignatures.getOrElse( + ModelSignatureConstants.EncoderInputIds.key, + "missing_encoder_input_ids"), + encoderInputTensors) + .feed( + _tfBartSignatures.getOrElse( + ModelSignatureConstants.EncoderAttentionMask.key, + "missing_encoder_attention_mask"), + encoderAttentionMaskTensors) + .fetch(_tfBartSignatures + .getOrElse(ModelSignatureConstants.CachedEncoderOutput.key, "missing_last_hidden_state")) val encoderOuts = runner.run().asScala val encoderOutsFloats = TensorResources.extractFloats(encoderOuts.head) @@ -399,27 +387,65 @@ private[johnsnowlabs] class Bart( val runner = if (nextStateTensor1.isEmpty || nextStateTensor2.isEmpty) { val r = session.runner - .feed(decoderInitInputIdsKey, decoderInputTensors) - .feed(decoderInitEncoderStateKey, decoderEncoderStateTensors) - .feed(decoderInitEncoderAttentionMaskKey, encoderAttentionMaskTensors) - .fetch(decoderInitOutputLogitsKey) + .feed( + _tfBartSignatures.getOrElse( + ModelSignatureConstants.InitDecoderInputIds.key, + "missing_decoder_input_ids_init"), + decoderInputTensors) + .feed( + _tfBartSignatures.getOrElse( + ModelSignatureConstants.InitDecoderEncoderInputIds.key, + "missing_encoder_state_init"), + decoderEncoderStateTensors) + .feed( + _tfBartSignatures.getOrElse( + ModelSignatureConstants.InitDecoderEncoderAttentionMask.key, + "missing_decoder_encoder_attention_mask_init"), + encoderAttentionMaskTensors) + .fetch(_tfBartSignatures + .getOrElse(ModelSignatureConstants.InitLogitsOutput.key, "missing_logits_init")) if (!useCache) r else r - .fetch(decoderInitOutputCache1Key) - .fetch(decoderInitOutputCache2Key) + .fetch(_tfBartSignatures + .getOrElse(ModelSignatureConstants.InitCachedOutput1.key, "missing_cache1_out_init")) + .fetch(_tfBartSignatures + .getOrElse(ModelSignatureConstants.InitCachedOutPut2.key, "missing_cache2_out_init")) } else { session.runner - .feed(decoderCachedInputIdsKey, decoderInputTensors) - .feed(decoderCachedEncoderStateKey, decoderEncoderStateTensors) - .feed(decoderCachedEncoderAttentionMaskKey, encoderAttentionMaskTensors) - .feed(decoderCachedCache1Key, nextStateTensor1.get) - .feed(decoderCachedCache2Key, nextStateTensor2.get) - .fetch(decoderCachedOutputLogitsKey) - .fetch(decoderCachedOutputCache1Key) - .fetch(decoderCachedOutputCache2Key) + .feed( + _tfBartSignatures.getOrElse( + 
ModelSignatureConstants.CachedDecoderInputIds.key, + "missing_decoder_input_ids"), + decoderInputTensors) + .feed( + _tfBartSignatures.getOrElse( + ModelSignatureConstants.CachedDecoderEncoderInputIds.key, + "missing_encoder_state"), + decoderEncoderStateTensors) + .feed( + _tfBartSignatures.getOrElse( + ModelSignatureConstants.CachedDecoderEncoderAttentionMask.key, + "missing_decoder_encoder_attention_mask"), + encoderAttentionMaskTensors) + .feed( + _tfBartSignatures.getOrElse( + ModelSignatureConstants.CachedDecoderInputCache1.key, + "missing_decoder_input_cache1"), + nextStateTensor1.get) + .feed( + _tfBartSignatures.getOrElse( + ModelSignatureConstants.CachedDecoderInputCache2.key, + "missing_decoder_input_cache2"), + nextStateTensor2.get) + .fetch(_tfBartSignatures + .getOrElse(ModelSignatureConstants.CachedLogitsOutput.key, "missing_logits_out")) + .fetch(_tfBartSignatures + .getOrElse(ModelSignatureConstants.CachedOutput1.key, "missing_cache1_out")) + .fetch(_tfBartSignatures + .getOrElse(ModelSignatureConstants.CachedOutPut2.key, "missing_cache2_out")) } val decoderOuts = runner.run().asScala diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants.scala index 8a6c864b0d14bb..383c6b8751582a 100644 --- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants.scala +++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants.scala @@ -198,6 +198,81 @@ object ModelSignatureConstants { override val value: String = "input_values:0" } + case object CachedEncoderOutput extends TFInfoNameMapper { + override val key: String = "last_hidden_state" + override val value: String = "StatefulPartitionedCall_2:0" + } + + case object CachedDecoderInputIds extends TFInfoNameMapper { + override val key: String = "decoder_input_ids" + override val value: String = "decoder_cached_decoder_input_ids:0" + } + + case object CachedDecoderEncoderInputIds extends TFInfoNameMapper { + override val key: String = "encoder_state" + override val value: String = "decoder_cached_encoder_state:0" + } + + case object CachedDecoderEncoderAttentionMask extends TFInfoNameMapper { + override val key: String = "decoder_encoder_attention_mask" + override val value: String = "decoder_cached_decoder_encoder_attention_mask:0" + } + + case object CachedDecoderInputCache1 extends TFInfoNameMapper { + override val key: String = "cache1" + override val value: String = "decoder_cached_cache1:0" + } + + case object CachedDecoderInputCache2 extends TFInfoNameMapper { + override val key: String = "cache2" + override val value: String = "decoder_cached_cache2:0" + } + + case object CachedOutput1 extends TFInfoNameMapper { + override val key: String = "cache1_out" + override val value: String = "StatefulPartitionedCall:0" + } + + case object CachedOutPut2 extends TFInfoNameMapper { + override val key: String = "cache2_out" + override val value: String = "StatefulPartitionedCall:1" + } + + case object CachedLogitsOutput extends TFInfoNameMapper { + override val key: String = "logits" + override val value: String = "StatefulPartitionedCall:2" + } + + case object InitDecoderInputIds extends TFInfoNameMapper { + override val key: String = "decoder_input_ids_init" + override val value: String = "decoder_init_decoder_input_ids_init:0" + } + + case object InitDecoderEncoderInputIds extends TFInfoNameMapper { + override val key: String = "encoder_state_init" + override val value: String = 
"decoder_init_encoder_state_init:0" + } + + case object InitDecoderEncoderAttentionMask extends TFInfoNameMapper { + override val key: String = "decoder_encoder_attention_mask_init" + override val value: String = "decoder_init_decoder_encoder_attention_mask_init:0" + } + + case object InitCachedOutput1 extends TFInfoNameMapper { + override val key: String = "cache1_out_init" + override val value: String = "StatefulPartitionedCall_1:0" + } + + case object InitCachedOutPut2 extends TFInfoNameMapper { + override val key: String = "cache2_out_init" + override val value: String = "StatefulPartitionedCall_1:1" + } + + case object InitLogitsOutput extends TFInfoNameMapper { + override val key: String = "logits_init" + override val value: String = "StatefulPartitionedCall_1:2" + } + /** Retrieve signature patterns for a given provider * * @param modelProvider From e78336390fe1a546941928e27c8f61d4c9105bf5 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 10 May 2023 18:40:38 +0200 Subject: [PATCH 29/32] Update CHANGELOG [run doc] --- CHANGELOG | 20 ++++++++++++++++++++ docs/en/install.md | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index a8540686c55690..bf1a7d04277e70 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,23 @@ +======== +4.4.2 +======== +---------------- +New Features & Enhancements +---------------- +* InImplement a new Zero-Shot Text Classification for RoBERTa annotator called `RobertaForZeroShotClassification` +* Support Apache Spark 3.4 +* Omptize BART models for memory efficiency +* Introducing `cache` feature in BartTransformer +* Improve error handling for max sequence length for transformers in Python +* Improve `MultiDateMatcher` annotator to return multiple dates + +---------------- +Bug Fixes +---------------- +* Fix a bug in Tapas due to exceeding the maximum rank value +* Fix loading Transformer models via loadSavedModel() method from DBFS on Databricks + + ======== 4.4.1 ======== diff --git a/docs/en/install.md b/docs/en/install.md index b539d71aefbfec..460df8b345a361 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -5,7 +5,7 @@ seotitle: Spark NLP - Installation title: Spark NLP - Installation permalink: /docs/en/install key: docs-install -modify_date: "2022-12-21" +modify_date: "2023-05-10" show_nav: true sidebar: nav: sparknlp From a3e70d2a84af85ed31560834417893a53c92ff4a Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 10 May 2023 16:51:32 +0000 Subject: [PATCH 30/32] Update Scala and Python APIs --- docs/api/com/index.html | 8 +- .../johnsnowlabs/client/CredentialParams.html | 8 +- .../client/aws/AWSAnonymousCredentials.html | 8 +- .../client/aws/AWSBasicCredentials.html | 8 +- .../client/aws/AWSCredentialsProvider.html | 8 +- .../johnsnowlabs/client/aws/AWSGateway.html | 8 +- .../client/aws/AWSProfileCredentials.html | 8 +- .../client/aws/AWSTokenCredentials.html | 8 +- .../johnsnowlabs/client/aws/Credentials.html | 8 +- .../com/johnsnowlabs/client/aws/index.html | 8 +- .../johnsnowlabs/client/gcp/GCPGateway.html | 8 +- .../com/johnsnowlabs/client/gcp/index.html | 8 +- docs/api/com/johnsnowlabs/client/index.html | 8 +- .../johnsnowlabs/collections/SearchTrie$.html | 8 +- .../johnsnowlabs/collections/SearchTrie.html | 8 +- .../collections/StorageSearchTrie$.html | 8 +- .../collections/StorageSearchTrie.html | 8 +- .../com/johnsnowlabs/collections/index.html | 8 +- docs/api/com/johnsnowlabs/index.html | 8 +- docs/api/com/johnsnowlabs/ml/ai/DeBerta.html | 8 +- .../ml/ai/MergeTokenStrategy$.html | 8 
+- docs/api/com/johnsnowlabs/ml/ai/index.html | 8 +- .../ml/ai/util/Generation/Generate.html | 88 +- .../ml/ai/util/Generation/Logit/Logit.html | 8 +- .../Logit/LogitProcess/LogitProcessor.html | 8 +- .../LogitProcess/MinLengthLogitProcessor.html | 8 +- .../NoRepeatNgramsLogitProcessor.html | 8 +- .../RepetitionPenaltyLogitProcessor.html | 8 +- .../Generation/Logit/LogitProcess/index.html | 8 +- .../Generation/Logit/LogitProcessorList.html | 8 +- .../Logit/LogitWarper/LogitWarper.html | 8 +- .../LogitWarper/TemperatureLogitWarper.html | 8 +- .../Logit/LogitWarper/TopKLogitWarper.html | 8 +- .../Logit/LogitWarper/TopPLogitWarper.html | 8 +- .../Generation/Logit/LogitWarper/index.html | 8 +- .../ml/ai/util/Generation/Logit/index.html | 8 +- .../Generation/Search/BeamHypotheses.html | 8 +- .../ai/util/Generation/Search/BeamScorer.html | 8 +- .../Generation/Search/BeamSearchScorer.html | 8 +- .../ml/ai/util/Generation/Search/index.html | 8 +- .../ml/ai/util/Generation/index.html | 8 +- .../com/johnsnowlabs/ml/ai/util/index.html | 8 +- docs/api/com/johnsnowlabs/ml/crf/Attr.html | 8 +- .../com/johnsnowlabs/ml/crf/AttrFeature.html | 8 +- .../api/com/johnsnowlabs/ml/crf/AttrStat.html | 8 +- .../com/johnsnowlabs/ml/crf/CrfDataset.html | 8 +- .../com/johnsnowlabs/ml/crf/CrfParams.html | 8 +- .../johnsnowlabs/ml/crf/DatasetEncoder.html | 8 +- .../johnsnowlabs/ml/crf/DatasetMetadata.html | 8 +- .../johnsnowlabs/ml/crf/DatasetReader$.html | 8 +- .../johnsnowlabs/ml/crf/EdgeCalculator$.html | 8 +- .../com/johnsnowlabs/ml/crf/FbCalculator.html | 8 +- .../api/com/johnsnowlabs/ml/crf/Instance.html | 8 +- .../johnsnowlabs/ml/crf/InstanceLabels.html | 8 +- .../johnsnowlabs/ml/crf/L2DecayStrategy.html | 8 +- .../johnsnowlabs/ml/crf/LinearChainCrf.html | 8 +- .../ml/crf/LinearChainCrfModel.html | 8 +- .../ml/crf/SerializedDatasetMetadata.html | 8 +- .../ml/crf/SerializedLinearChainCrfModel.html | 8 +- .../ml/crf/SparseArray$$SeqWrapper.html | 8 +- .../com/johnsnowlabs/ml/crf/SparseArray$.html | 8 +- .../com/johnsnowlabs/ml/crf/SparseArray.html | 8 +- .../ml/crf/TextSentenceAttrs.html | 8 +- .../ml/crf/TextSentenceLabels.html | 8 +- .../com/johnsnowlabs/ml/crf/Transition.html | 8 +- .../com/johnsnowlabs/ml/crf/VectorMath$.html | 8 +- .../com/johnsnowlabs/ml/crf/WordAttrs.html | 8 +- docs/api/com/johnsnowlabs/ml/crf/index.html | 8 +- docs/api/com/johnsnowlabs/ml/index.html | 8 +- .../tensorflow/ClassifierDatasetEncoder.html | 8 +- .../ClassifierDatasetEncoderParams.html | 8 +- .../ml/tensorflow/DatasetEncoderParams.html | 8 +- .../johnsnowlabs/ml/tensorflow/Logging.html | 8 +- .../ml/tensorflow/ModelSignature.html | 8 +- .../johnsnowlabs/ml/tensorflow/NerBatch$.html | 8 +- .../johnsnowlabs/ml/tensorflow/NerBatch.html | 8 +- .../ml/tensorflow/NerDatasetEncoder.html | 8 +- .../ml/tensorflow/ReadTensorflowModel.html | 10 +- .../ml/tensorflow/SentenceGrouper.html | 8 +- .../ml/tensorflow/TensorResources$.html | 8 +- .../ml/tensorflow/TensorResources.html | 8 +- .../ml/tensorflow/TensorflowClassifier.html | 8 +- .../ml/tensorflow/TensorflowWrapper$.html | 8 +- .../ml/tensorflow/TensorflowWrapper.html | 8 +- .../johnsnowlabs/ml/tensorflow/Variables.html | 8 +- .../ml/tensorflow/WriteTensorflowModel.html | 10 +- .../com/johnsnowlabs/ml/tensorflow/index.html | 8 +- .../sentencepiece/ReadSentencePieceModel.html | 8 +- .../sentencepiece/SentencePieceException.html | 8 +- .../sentencepiece/SentencePieceProcessor.html | 8 +- .../sentencepiece/SentencePieceWrapper$.html | 8 +- .../WriteSentencePieceModel.html | 8 +- 
.../ml/tensorflow/sentencepiece/index.html | 8 +- ...delSignatureConstants$$AttentionMask$.html | 68 +- ...lSignatureConstants$$AttentionMaskV1$.html | 68 +- ...SignatureConstants$$AudioValuesInput$.html | 68 +- ...s$$CachedDecoderEncoderAttentionMask$.html | 840 ++++ ...stants$$CachedDecoderEncoderInputIds$.html | 840 ++++ ...eConstants$$CachedDecoderInputCache1$.html | 840 ++++ ...eConstants$$CachedDecoderInputCache2$.html | 840 ++++ ...tureConstants$$CachedDecoderInputIds$.html | 840 ++++ ...natureConstants$$CachedEncoderOutput$.html | 840 ++++ ...gnatureConstants$$CachedLogitsOutput$.html | 840 ++++ ...delSignatureConstants$$CachedOutPut2$.html | 840 ++++ ...delSignatureConstants$$CachedOutput1$.html | 840 ++++ .../sign/ModelSignatureConstants$$DType$.html | 68 +- ...atureConstants$$DecoderAttentionMask$.html | 68 +- ...nstants$$DecoderEncoderAttentionMask$.html | 68 +- ...ureConstants$$DecoderEncoderInputIds$.html | 68 +- ...lSignatureConstants$$DecoderInputIds$.html | 68 +- ...delSignatureConstants$$DecoderOutput$.html | 68 +- .../ModelSignatureConstants$$DimCount$.html | 68 +- ...atureConstants$$EncoderAttentionMask$.html | 68 +- ...lSignatureConstants$$EncoderInputIds$.html | 68 +- ...delSignatureConstants$$EncoderOutput$.html | 68 +- ...lSignatureConstants$$EndLogitsOutput$.html | 68 +- ...ignatureConstants$$InitCachedOutPut2$.html | 840 ++++ ...ignatureConstants$$InitCachedOutput1$.html | 840 ++++ ...nts$$InitDecoderEncoderAttentionMask$.html | 840 ++++ ...onstants$$InitDecoderEncoderInputIds$.html | 840 ++++ ...natureConstants$$InitDecoderInputIds$.html | 840 ++++ ...SignatureConstants$$InitLogitsOutput$.html | 840 ++++ .../ModelSignatureConstants$$InputIds$.html | 68 +- .../ModelSignatureConstants$$InputIdsV1$.html | 68 +- ...lSignatureConstants$$LastHiddenState$.html | 68 +- ...ignatureConstants$$LastHiddenStateV1$.html | 68 +- ...odelSignatureConstants$$LogitsOutput$.html | 68 +- .../sign/ModelSignatureConstants$$Name$.html | 68 +- ...SignatureConstants$$PixelValuesInput$.html | 68 +- ...odelSignatureConstants$$PoolerOutput$.html | 68 +- ...elSignatureConstants$$PoolerOutputV1$.html | 68 +- ...elSignatureConstants$$SerializedSize$.html | 68 +- ...odelSignatureConstants$$ShapeDimList$.html | 68 +- ...ignatureConstants$$StartLogitsOutput$.html | 68 +- ...lSignatureConstants$$TFInfoDescriptor.html | 68 +- ...lSignatureConstants$$TFInfoNameMapper.html | 70 +- ...stants$$TapasLogitsAggregationOutput$.html | 68 +- ...ignatureConstants$$TapasLogitsOutput$.html | 68 +- ...odelSignatureConstants$$TokenTypeIds$.html | 68 +- ...elSignatureConstants$$TokenTypeIdsV1$.html | 68 +- .../sign/ModelSignatureConstants$.html | 248 +- .../sign/ModelSignatureManager$.html | 8 +- .../ml/tensorflow/sign/index.html | 8 +- .../ml/util/LoadExternalModel$.html | 8 +- .../johnsnowlabs/ml/util/ModelEngine$.html | 8 +- docs/api/com/johnsnowlabs/ml/util/index.html | 8 +- .../johnsnowlabs/nlp/ActivationFunction$.html | 8 +- .../nlp/Annotation$$AnnotationContainer.html | 8 +- ...nnotation$$extractors$$AnnotationData.html | 8 +- .../nlp/Annotation$$extractors$.html | 8 +- .../api/com/johnsnowlabs/nlp/Annotation$.html | 8 +- docs/api/com/johnsnowlabs/nlp/Annotation.html | 8 +- .../AnnotationAudio$$AnnotationContainer.html | 8 +- .../nlp/AnnotationAudio$$AudioFields.html | 8 +- .../johnsnowlabs/nlp/AnnotationAudio$.html | 8 +- .../com/johnsnowlabs/nlp/AnnotationAudio.html | 8 +- .../AnnotationImage$$AnnotationContainer.html | 8 +- .../nlp/AnnotationImage$$ImageFields.html | 8 +- 
.../johnsnowlabs/nlp/AnnotationImage$.html | 8 +- .../com/johnsnowlabs/nlp/AnnotationImage.html | 8 +- .../johnsnowlabs/nlp/AnnotatorApproach.html | 10 +- .../com/johnsnowlabs/nlp/AnnotatorModel.html | 12 +- .../com/johnsnowlabs/nlp/AnnotatorType$.html | 8 +- .../com/johnsnowlabs/nlp/AudioAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/AudioAssembler.html | 10 +- docs/api/com/johnsnowlabs/nlp/CanBeLazy.html | 10 +- docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html | 8 +- docs/api/com/johnsnowlabs/nlp/Doc2Chunk.html | 10 +- .../johnsnowlabs/nlp/DocumentAssembler$.html | 8 +- .../johnsnowlabs/nlp/DocumentAssembler.html | 10 +- .../johnsnowlabs/nlp/EmbeddingsFinisher$.html | 8 +- .../johnsnowlabs/nlp/EmbeddingsFinisher.html | 10 +- .../com/johnsnowlabs/nlp/FeaturesReader.html | 8 +- .../com/johnsnowlabs/nlp/FeaturesWriter.html | 8 +- docs/api/com/johnsnowlabs/nlp/Finisher$.html | 8 +- docs/api/com/johnsnowlabs/nlp/Finisher.html | 10 +- .../com/johnsnowlabs/nlp/GraphFinisher.html | 10 +- .../nlp/HasAudioFeatureProperties.html | 10 +- .../johnsnowlabs/nlp/HasBatchedAnnotate.html | 10 +- .../nlp/HasBatchedAnnotateAudio.html | 8 +- .../nlp/HasBatchedAnnotateImage.html | 8 +- .../nlp/HasCandidateLabelsProperties.html | 12 +- .../nlp/HasCaseSensitiveProperties.html | 12 +- .../HasClassifierActivationProperties.html | 12 +- .../nlp/HasEnableCachingProperties.html | 10 +- docs/api/com/johnsnowlabs/nlp/HasEngine.html | 12 +- .../api/com/johnsnowlabs/nlp/HasFeatures.html | 10 +- .../nlp/HasImageFeatureProperties.html | 10 +- .../nlp/HasInputAnnotationCols.html | 12 +- .../nlp/HasMultipleInputAnnotationCols.html | 10 +- .../nlp/HasOutputAnnotationCol.html | 12 +- .../nlp/HasOutputAnnotatorType.html | 10 +- .../com/johnsnowlabs/nlp/HasPretrained.html | 10 +- .../com/johnsnowlabs/nlp/HasRecursiveFit.html | 8 +- .../nlp/HasRecursiveTransform.html | 8 +- .../johnsnowlabs/nlp/HasSimpleAnnotate.html | 8 +- .../api/com/johnsnowlabs/nlp/IAnnotation.html | 8 +- .../com/johnsnowlabs/nlp/ImageAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/ImageAssembler.html | 10 +- .../com/johnsnowlabs/nlp/JavaAnnotation.html | 8 +- .../com/johnsnowlabs/nlp/LightPipeline.html | 8 +- .../nlp/MultiDocumentAssembler$.html | 8 +- .../nlp/MultiDocumentAssembler.html | 10 +- .../nlp/ParamsAndFeaturesReadable.html | 10 +- .../nlp/ParamsAndFeaturesWritable.html | 12 +- .../com/johnsnowlabs/nlp/RawAnnotator.html | 12 +- .../johnsnowlabs/nlp/RecursivePipeline.html | 10 +- .../nlp/RecursivePipelineModel.html | 10 +- docs/api/com/johnsnowlabs/nlp/SparkNLP$.html | 8 +- .../com/johnsnowlabs/nlp/TableAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/TableAssembler.html | 10 +- .../com/johnsnowlabs/nlp/TokenAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/TokenAssembler.html | 10 +- .../nlp/annotators/Chunk2Doc$.html | 8 +- .../nlp/annotators/Chunk2Doc.html | 10 +- .../nlp/annotators/ChunkTokenizer$.html | 8 +- .../nlp/annotators/ChunkTokenizer.html | 10 +- .../nlp/annotators/ChunkTokenizerModel$.html | 8 +- .../nlp/annotators/ChunkTokenizerModel.html | 10 +- .../johnsnowlabs/nlp/annotators/Chunker$.html | 8 +- .../johnsnowlabs/nlp/annotators/Chunker.html | 10 +- .../nlp/annotators/Date2Chunk$.html | 8 +- .../nlp/annotators/Date2Chunk.html | 10 +- .../nlp/annotators/DateMatcher$.html | 8 +- .../nlp/annotators/DateMatcher.html | 78 +- .../nlp/annotators/DateMatcherTranslator.html | 8 +- .../DateMatcherTranslatorPolicy.html | 8 +- .../nlp/annotators/DateMatcherUtils.html | 72 +- .../nlp/annotators/DocumentNormalizer$.html | 8 +- 
.../nlp/annotators/DocumentNormalizer.html | 10 +- .../nlp/annotators/EnglishStemmer$.html | 8 +- .../nlp/annotators/GraphExtraction.html | 10 +- .../nlp/annotators/Lemmatizer$.html | 8 +- .../nlp/annotators/Lemmatizer.html | 10 +- .../nlp/annotators/LemmatizerModel$.html | 8 +- .../nlp/annotators/LemmatizerModel.html | 10 +- .../nlp/annotators/LookAroundManager$.html | 8 +- .../nlp/annotators/MultiDateMatcher$.html | 8 +- .../nlp/annotators/MultiDateMatcher.html | 78 +- .../nlp/annotators/MultiDatePolicy$.html | 8 +- .../nlp/annotators/NGramGenerator$.html | 8 +- .../nlp/annotators/NGramGenerator.html | 10 +- .../nlp/annotators/Normalizer$.html | 8 +- .../nlp/annotators/Normalizer.html | 10 +- .../nlp/annotators/NormalizerModel$.html | 8 +- ...alizerModel$TokenizerAndNormalizerMap.html | 8 +- .../nlp/annotators/NormalizerModel.html | 10 +- .../annotators/PretrainedAnnotations$.html | 8 +- .../ReadablePretrainedLemmatizer.html | 8 +- ...adablePretrainedStopWordsCleanerModel.html | 8 +- .../ReadablePretrainedTextMatcher.html | 8 +- .../ReadablePretrainedTokenizer.html | 8 +- .../nlp/annotators/RecursiveTokenizer.html | 10 +- .../annotators/RecursiveTokenizerModel$.html | 8 +- .../annotators/RecursiveTokenizerModel.html | 10 +- .../nlp/annotators/RegexMatcher$.html | 8 +- .../nlp/annotators/RegexMatcher.html | 10 +- .../nlp/annotators/RegexMatcherModel$.html | 8 +- .../nlp/annotators/RegexMatcherModel.html | 10 +- .../nlp/annotators/RegexTokenizer$.html | 8 +- .../nlp/annotators/RegexTokenizer.html | 10 +- .../nlp/annotators/SingleDatePolicy$.html | 8 +- .../johnsnowlabs/nlp/annotators/Stemmer$.html | 8 +- .../johnsnowlabs/nlp/annotators/Stemmer.html | 10 +- .../nlp/annotators/StopWordsCleaner$.html | 8 +- .../nlp/annotators/StopWordsCleaner.html | 10 +- .../nlp/annotators/TextMatcher$.html | 8 +- .../nlp/annotators/TextMatcher.html | 10 +- .../nlp/annotators/TextMatcherModel$.html | 8 +- .../nlp/annotators/TextMatcherModel.html | 10 +- .../nlp/annotators/Token2Chunk$.html | 8 +- .../nlp/annotators/Token2Chunk.html | 10 +- .../nlp/annotators/Tokenizer$.html | 8 +- .../nlp/annotators/Tokenizer.html | 10 +- .../nlp/annotators/TokenizerModel$.html | 8 +- .../nlp/annotators/TokenizerModel.html | 10 +- .../nlp/annotators/audio/HubertForCTC$.html | 8 +- .../nlp/annotators/audio/HubertForCTC.html | 10 +- .../audio/ReadHubertForAudioDLModel.html | 8 +- .../audio/ReadWav2Vec2ForAudioDLModel.html | 8 +- ...ReadablePretrainedHubertForAudioModel.html | 8 +- ...adablePretrainedWav2Vec2ForAudioModel.html | 8 +- .../nlp/annotators/audio/Wav2Vec2ForCTC$.html | 8 +- .../nlp/annotators/audio/Wav2Vec2ForCTC.html | 10 +- .../nlp/annotators/audio/index.html | 8 +- .../nlp/annotators/btm/BigTextMatcher$.html | 8 +- .../nlp/annotators/btm/BigTextMatcher.html | 10 +- .../annotators/btm/BigTextMatcherModel$.html | 8 +- .../annotators/btm/BigTextMatcherModel.html | 10 +- .../btm/ReadablePretrainedBigTextMatcher.html | 8 +- .../nlp/annotators/btm/TMEdgesReadWriter.html | 8 +- .../nlp/annotators/btm/TMEdgesReader.html | 8 +- .../nlp/annotators/btm/TMNodesReader.html | 8 +- .../nlp/annotators/btm/TMNodesWriter.html | 8 +- .../nlp/annotators/btm/TMVocabReadWriter.html | 8 +- .../nlp/annotators/btm/TMVocabReader.html | 8 +- .../nlp/annotators/btm/TrieNode.html | 8 +- .../nlp/annotators/btm/index.html | 8 +- .../dl/AlbertForQuestionAnswering$.html | 20 +- .../dl/AlbertForQuestionAnswering.html | 22 +- .../dl/AlbertForSequenceClassification$.html | 20 +- .../dl/AlbertForSequenceClassification.html | 22 +- 
.../dl/AlbertForTokenClassification$.html | 20 +- .../dl/AlbertForTokenClassification.html | 22 +- .../dl/BertForQuestionAnswering$.html | 20 +- .../dl/BertForQuestionAnswering.html | 22 +- .../dl/BertForSequenceClassification$.html | 20 +- .../dl/BertForSequenceClassification.html | 22 +- .../dl/BertForTokenClassification$.html | 20 +- .../dl/BertForTokenClassification.html | 22 +- .../dl/BertForZeroShotClassification$.html | 20 +- .../dl/BertForZeroShotClassification.html | 22 +- .../dl/CamemBertForQuestionAnswering$.html | 20 +- .../dl/CamemBertForQuestionAnswering.html | 22 +- .../CamemBertForSequenceClassification$.html | 20 +- .../CamemBertForSequenceClassification.html | 22 +- .../dl/CamemBertForTokenClassification$.html | 20 +- .../dl/CamemBertForTokenClassification.html | 22 +- .../classifier/dl/ClassifierDLApproach$.html | 20 +- .../classifier/dl/ClassifierDLApproach.html | 22 +- .../classifier/dl/ClassifierDLModel$.html | 20 +- .../classifier/dl/ClassifierDLModel.html | 22 +- .../classifier/dl/ClassifierEncoder.html | 22 +- .../classifier/dl/ClassifierMetrics.html | 20 +- .../dl/DeBertaForQuestionAnswering$.html | 20 +- .../dl/DeBertaForQuestionAnswering.html | 22 +- .../dl/DeBertaForSequenceClassification$.html | 20 +- .../dl/DeBertaForSequenceClassification.html | 22 +- .../dl/DeBertaForTokenClassification$.html | 20 +- .../dl/DeBertaForTokenClassification.html | 22 +- .../dl/DistilBertForQuestionAnswering$.html | 20 +- .../dl/DistilBertForQuestionAnswering.html | 22 +- .../DistilBertForSequenceClassification$.html | 20 +- .../DistilBertForSequenceClassification.html | 22 +- .../dl/DistilBertForTokenClassification$.html | 20 +- .../dl/DistilBertForTokenClassification.html | 22 +- .../DistilBertForZeroShotClassification$.html | 20 +- .../DistilBertForZeroShotClassification.html | 22 +- .../dl/LongformerForQuestionAnswering$.html | 20 +- .../dl/LongformerForQuestionAnswering.html | 22 +- .../LongformerForSequenceClassification$.html | 20 +- .../LongformerForSequenceClassification.html | 22 +- .../dl/LongformerForTokenClassification$.html | 20 +- .../dl/LongformerForTokenClassification.html | 22 +- .../dl/MultiClassifierDLApproach.html | 22 +- .../dl/MultiClassifierDLModel$.html | 20 +- .../classifier/dl/MultiClassifierDLModel.html | 22 +- ...ReadAlbertForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadAlbertForSequenceDLModel.html | 20 +- .../dl/ReadAlbertForTokenDLModel.html | 20 +- .../ReadBertForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadBertForSequenceDLModel.html | 20 +- .../dl/ReadBertForTokenDLModel.html | 20 +- .../dl/ReadBertForZeroShotDLModel.html | 20 +- .../dl/ReadCamemBertForQADLModel.html | 20 +- .../dl/ReadCamemBertForSequenceDLModel.html | 20 +- .../dl/ReadCamemBertForTokenDLModel.html | 20 +- .../dl/ReadClassifierDLTensorflowModel.html | 20 +- ...eadDeBertaForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadDeBertaForSequenceDLModel.html | 20 +- .../dl/ReadDeBertaForTokenDLModel.html | 20 +- ...DistilBertForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadDistilBertForSequenceDLModel.html | 20 +- .../dl/ReadDistilBertForTokenDLModel.html | 20 +- .../dl/ReadDistilBertForZeroShotDLModel.html | 20 +- ...LongformerForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadLongformerForSequenceDLModel.html | 20 +- .../dl/ReadLongformerForTokenDLModel.html | 20 +- .../ReadMultiClassifierDLTensorflowModel.html | 20 +- ...eadRoBertaForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadRoBertaForSequenceDLModel.html | 20 +- .../dl/ReadRoBertaForTokenDLModel.html | 20 +- 
.../dl/ReadRoBertaForZeroShotDLModel.html | 1177 ++++++ .../dl/ReadSentimentDLTensorflowModel.html | 20 +- .../ReadTapasForQuestionAnsweringDLModel.html | 20 +- ...XlmRoBertaForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadXlmRoBertaForSequenceDLModel.html | 20 +- .../dl/ReadXlmRoBertaForTokenDLModel.html | 20 +- .../dl/ReadXlnetForSequenceDLModel.html | 20 +- .../dl/ReadXlnetForTokenDLModel.html | 20 +- .../ReadablePretrainedAlbertForQAModel.html | 20 +- ...dablePretrainedAlbertForSequenceModel.html | 20 +- ...ReadablePretrainedAlbertForTokenModel.html | 20 +- .../dl/ReadablePretrainedBertForQAModel.html | 20 +- ...eadablePretrainedBertForSequenceModel.html | 20 +- .../ReadablePretrainedBertForTokenModel.html | 20 +- ...eadablePretrainedBertForZeroShotModel.html | 20 +- ...ReadablePretrainedCamemBertForQAModel.html | 20 +- ...lePretrainedCamemBertForSequenceModel.html | 20 +- ...dablePretrainedCamemBertForTokenModel.html | 20 +- .../dl/ReadablePretrainedClassifierDL.html | 20 +- .../ReadablePretrainedDeBertaForQAModel.html | 20 +- ...ablePretrainedDeBertaForSequenceModel.html | 20 +- ...eadablePretrainedDeBertaForTokenModel.html | 20 +- ...eadablePretrainedDistilBertForQAModel.html | 20 +- ...ePretrainedDistilBertForSequenceModel.html | 20 +- ...ablePretrainedDistilBertForTokenModel.html | 20 +- ...ePretrainedDistilBertForZeroShotModel.html | 20 +- ...eadablePretrainedLongformerForQAModel.html | 20 +- ...ePretrainedLongformerForSequenceModel.html | 20 +- ...ablePretrainedLongformerForTokenModel.html | 20 +- .../ReadablePretrainedMultiClassifierDL.html | 20 +- .../ReadablePretrainedRoBertaForQAModel.html | 20 +- ...ablePretrainedRoBertaForSequenceModel.html | 20 +- ...eadablePretrainedRoBertaForTokenModel.html | 20 +- ...ablePretrainedRoBertaForZeroShotModel.html | 1237 ++++++ .../dl/ReadablePretrainedSentimentDL.html | 20 +- .../dl/ReadablePretrainedTapasForQAModel.html | 20 +- ...eadablePretrainedXlmRoBertaForQAModel.html | 20 +- ...ePretrainedXlmRoBertaForSequenceModel.html | 20 +- ...ablePretrainedXlmRoBertaForTokenModel.html | 20 +- ...adablePretrainedXlnetForSequenceModel.html | 20 +- .../ReadablePretrainedXlnetForTokenModel.html | 20 +- .../dl/RoBertaForQuestionAnswering$.html | 20 +- .../dl/RoBertaForQuestionAnswering.html | 22 +- .../dl/RoBertaForSequenceClassification$.html | 20 +- .../dl/RoBertaForSequenceClassification.html | 22 +- .../dl/RoBertaForTokenClassification$.html | 20 +- .../dl/RoBertaForTokenClassification.html | 22 +- .../dl/RoBertaForZeroShotClassification$.html | 1358 +++++++ .../dl/RoBertaForZeroShotClassification.html | 3509 +++++++++++++++++ .../classifier/dl/SentimentApproach$.html | 20 +- .../classifier/dl/SentimentDLApproach.html | 22 +- .../classifier/dl/SentimentDLModel$.html | 20 +- .../classifier/dl/SentimentDLModel.html | 22 +- .../dl/TapasForQuestionAnswering$.html | 20 +- .../dl/TapasForQuestionAnswering.html | 22 +- .../dl/XlmRoBertaForQuestionAnswering$.html | 20 +- .../dl/XlmRoBertaForQuestionAnswering.html | 22 +- .../XlmRoBertaForSequenceClassification$.html | 20 +- .../XlmRoBertaForSequenceClassification.html | 22 +- .../dl/XlmRoBertaForTokenClassification$.html | 20 +- .../dl/XlmRoBertaForTokenClassification.html | 22 +- .../dl/XlnetForSequenceClassification$.html | 20 +- .../dl/XlnetForSequenceClassification.html | 22 +- .../dl/XlnetForTokenClassification$.html | 20 +- .../dl/XlnetForTokenClassification.html | 22 +- .../nlp/annotators/classifier/dl/index.html | 133 +- .../nlp/annotators/classifier/index.html | 8 +- 
.../nlp/annotators/common/Annotated$.html | 8 +- .../nlp/annotators/common/Annotated.html | 8 +- .../nlp/annotators/common/ChunkSplit$.html | 8 +- .../nlp/annotators/common/ConllSentence.html | 8 +- .../DatasetHelpers$$DataFrameHelper.html | 8 +- .../annotators/common/DatasetHelpers$.html | 8 +- .../annotators/common/DependencyParsed$.html | 8 +- .../common/DependencyParsedSentence.html | 8 +- .../common/EmbeddingsWithSentence$.html | 8 +- .../annotators/common/IndexedTaggedWord.html | 8 +- .../nlp/annotators/common/IndexedToken.html | 8 +- .../nlp/annotators/common/InfixToken$.html | 8 +- .../nlp/annotators/common/InfixToken.html | 8 +- .../LabeledDependency$$DependencyInfo.html | 8 +- .../annotators/common/LabeledDependency$.html | 8 +- .../nlp/annotators/common/NerTagged$.html | 8 +- .../nlp/annotators/common/PosTagged$.html | 8 +- .../nlp/annotators/common/PrefixedToken$.html | 8 +- .../nlp/annotators/common/PrefixedToken.html | 8 +- .../common/PreprocessingParser.html | 8 +- .../nlp/annotators/common/Sentence$.html | 8 +- .../nlp/annotators/common/Sentence.html | 8 +- .../nlp/annotators/common/SentenceSplit$.html | 8 +- .../nlp/annotators/common/SuffixedToken$.html | 8 +- .../nlp/annotators/common/SuffixedToken.html | 8 +- .../nlp/annotators/common/TableData$.html | 8 +- .../nlp/annotators/common/TableData.html | 8 +- .../nlp/annotators/common/Tagged.html | 8 +- .../annotators/common/TaggedSentence$.html | 8 +- .../nlp/annotators/common/TaggedSentence.html | 8 +- .../nlp/annotators/common/TaggedWord.html | 8 +- .../nlp/annotators/common/TokenPiece.html | 8 +- .../common/TokenPieceEmbeddings$.html | 8 +- .../common/TokenPieceEmbeddings.html | 8 +- .../annotators/common/TokenizedSentence.html | 8 +- .../common/TokenizedWithSentence$.html | 8 +- .../annotators/common/WordWithDependency.html | 8 +- .../common/WordpieceEmbeddingsSentence$.html | 8 +- .../common/WordpieceEmbeddingsSentence.html | 8 +- .../common/WordpieceTokenized$.html | 8 +- .../common/WordpieceTokenizedSentence.html | 8 +- .../nlp/annotators/common/index.html | 8 +- .../ReadSpanBertCorefTensorflowModel.html | 8 +- .../ReadablePretrainedSpanBertCorefModel.html | 8 +- .../annotators/coref/SpanBertCorefModel$.html | 8 +- .../annotators/coref/SpanBertCorefModel.html | 10 +- .../nlp/annotators/coref/index.html | 8 +- .../cv/ConvNextForImageClassification$.html | 8 +- .../cv/ConvNextForImageClassification.html | 10 +- .../cv/ReadConvNextForImageDLModel.html | 8 +- .../cv/ReadSwinForImageDLModel.html | 8 +- .../annotators/cv/ReadViTForImageDLModel.html | 8 +- ...adablePretrainedConvNextForImageModel.html | 8 +- .../ReadablePretrainedSwinForImageModel.html | 8 +- .../ReadablePretrainedViTForImageModel.html | 8 +- .../cv/SwinForImageClassification$.html | 8 +- .../cv/SwinForImageClassification.html | 10 +- .../cv/ViTForImageClassification$.html | 8 +- .../cv/ViTForImageClassification.html | 10 +- .../johnsnowlabs/nlp/annotators/cv/index.html | 8 +- .../er/AhoCorasickAutomaton$Node.html | 8 +- .../annotators/er/AhoCorasickAutomaton.html | 8 +- .../nlp/annotators/er/EntityPattern.html | 8 +- .../annotators/er/EntityRulerApproach.html | 10 +- .../annotators/er/EntityRulerFeatures.html | 8 +- .../nlp/annotators/er/EntityRulerModel$.html | 8 +- .../nlp/annotators/er/EntityRulerModel.html | 10 +- .../nlp/annotators/er/EntityRulerUtil$.html | 8 +- .../annotators/er/FlattenEntityPattern.html | 8 +- .../nlp/annotators/er/PatternsReadWriter.html | 8 +- .../nlp/annotators/er/PatternsReader.html | 8 +- 
.../er/ReadablePretrainedEntityRuler.html | 8 +- .../er/RegexPatternsReadWriter.html | 8 +- .../annotators/er/RegexPatternsReader.html | 8 +- .../johnsnowlabs/nlp/annotators/er/index.html | 8 +- .../johnsnowlabs/nlp/annotators/index.html | 8 +- .../nlp/annotators/keyword/index.html | 8 +- .../keyword/yake/YakeKeywordExtraction$.html | 8 +- .../keyword/yake/YakeKeywordExtraction.html | 10 +- .../annotators/keyword/yake/YakeParams.html | 10 +- .../nlp/annotators/keyword/yake/index.html | 8 +- .../annotators/keyword/yake/util/Token.html | 8 +- .../keyword/yake/util/Utilities$.html | 8 +- .../annotators/keyword/yake/util/index.html | 8 +- .../annotators/ld/dl/LanguageDetectorDL$.html | 8 +- .../annotators/ld/dl/LanguageDetectorDL.html | 10 +- ...ReadLanguageDetectorDLTensorflowModel.html | 8 +- ...ablePretrainedLanguageDetectorDLModel.html | 8 +- .../nlp/annotators/ld/dl/index.html | 8 +- .../johnsnowlabs/nlp/annotators/ld/index.html | 8 +- .../nlp/annotators/ner/ModelMetrics$.html | 8 +- .../nlp/annotators/ner/NamedEntity.html | 8 +- .../nlp/annotators/ner/NerApproach.html | 10 +- .../nlp/annotators/ner/NerConverter$.html | 8 +- .../nlp/annotators/ner/NerConverter.html | 10 +- .../nlp/annotators/ner/NerOverwriter$.html | 8 +- .../nlp/annotators/ner/NerOverwriter.html | 10 +- .../nlp/annotators/ner/NerTagsEncoding$.html | 8 +- .../nlp/annotators/ner/Verbose$.html | 8 +- .../ner/crf/DictionaryFeatures$.html | 8 +- .../ner/crf/DictionaryFeatures.html | 8 +- .../ner/crf/FeatureGenerator$TokenType$.html | 8 +- .../annotators/ner/crf/FeatureGenerator.html | 8 +- .../annotators/ner/crf/NerCrfApproach$.html | 8 +- .../annotators/ner/crf/NerCrfApproach.html | 10 +- .../nlp/annotators/ner/crf/NerCrfModel$.html | 8 +- .../nlp/annotators/ner/crf/NerCrfModel.html | 10 +- .../ner/crf/ReadablePretrainedNerCrf.html | 8 +- .../nlp/annotators/ner/crf/index.html | 8 +- .../nlp/annotators/ner/dl/LoadsContrib$.html | 8 +- .../nlp/annotators/ner/dl/NerDLApproach$.html | 8 +- .../nlp/annotators/ner/dl/NerDLApproach.html | 10 +- .../nlp/annotators/ner/dl/NerDLModel$.html | 8 +- .../nlp/annotators/ner/dl/NerDLModel.html | 10 +- .../ner/dl/NerDLModelPythonReader$.html | 8 +- .../ner/dl/ReadZeroShotNerDLModel.html | 8 +- .../ner/dl/ReadablePretrainedNerDL.html | 8 +- .../ner/dl/ReadablePretrainedZeroShotNer.html | 8 +- .../nlp/annotators/ner/dl/ReadsNERGraph.html | 8 +- .../annotators/ner/dl/WithGraphResolver.html | 8 +- .../annotators/ner/dl/ZeroShotNerModel$.html | 8 +- .../annotators/ner/dl/ZeroShotNerModel.html | 10 +- .../nlp/annotators/ner/dl/index.html | 8 +- .../nlp/annotators/ner/index.html | 8 +- ...lizableFormat$$SerializableDateFormat.html | 8 +- .../AnnotatorParam$SerializableFormat$.html | 8 +- .../nlp/annotators/param/AnnotatorParam.html | 8 +- .../annotators/param/EvaluationDLParams.html | 10 +- .../param/ExternalResourceParam.html | 8 +- .../param/SerializedAnnotatorComponent.html | 8 +- .../param/WritableAnnotatorComponent.html | 8 +- .../nlp/annotators/param/index.html | 8 +- .../parser/dep/DependencyParserApproach$.html | 8 +- .../parser/dep/DependencyParserApproach.html | 10 +- .../parser/dep/DependencyParserModel$.html | 8 +- .../parser/dep/DependencyParserModel.html | 10 +- .../GreedyTransition/DependencyMaker$.html | 8 +- .../DependencyMaker$CurrentState.html | 8 +- .../DependencyMaker$ParseState.html | 8 +- .../dep/GreedyTransition/DependencyMaker.html | 8 +- .../GreedyTransitionApproach$.html | 8 +- .../parser/dep/GreedyTransition/index.html | 8 +- .../GreedyTransition/package$$Feature.html | 
8 +- .../GreedyTransition/package$$WordData.html | 8 +- .../parser/dep/Perceptron$WeightLearner.html | 8 +- .../nlp/annotators/parser/dep/Perceptron.html | 8 +- .../dep/ReadablePretrainedDependency.html | 8 +- .../annotators/parser/dep/TagDictionary$.html | 8 +- .../nlp/annotators/parser/dep/Tagger$.html | 8 +- .../nlp/annotators/parser/dep/Tagger.html | 8 +- .../nlp/annotators/parser/dep/index.html | 8 +- .../nlp/annotators/parser/index.html | 8 +- .../annotators/parser/typdep/ConllData.html | 8 +- .../parser/typdep/DependencyArcList.html | 8 +- .../parser/typdep/DependencyInstance.html | 8 +- .../parser/typdep/DependencyPipe.html | 8 +- .../parser/typdep/LocalFeatureData.html | 8 +- .../parser/typdep/LowRankTensor.html | 8 +- .../nlp/annotators/parser/typdep/Options.html | 8 +- .../annotators/parser/typdep/Parameters.html | 8 +- .../parser/typdep/PredictionParameters.html | 8 +- .../ReadablePretrainedTypedDependency.html | 8 +- .../parser/typdep/TrainDependencies.html | 8 +- .../annotators/parser/typdep/TrainFile.html | 8 +- .../parser/typdep/TypedDependencyParser.html | 8 +- .../TypedDependencyParserApproach$.html | 8 +- .../typdep/TypedDependencyParserApproach.html | 10 +- .../typdep/TypedDependencyParserModel$.html | 8 +- .../typdep/TypedDependencyParserModel.html | 10 +- .../typdep/feature/FeatureTemplate.html | 8 +- .../feature/SyntacticFeatureFactory.html | 8 +- .../parser/typdep/feature/index.html | 8 +- .../nlp/annotators/parser/typdep/index.html | 8 +- .../parser/typdep/io/Conll09Reader.html | 8 +- .../parser/typdep/io/ConllUReader.html | 8 +- .../parser/typdep/io/ConllWriter.html | 8 +- .../parser/typdep/io/DependencyReader.html | 8 +- .../annotators/parser/typdep/io/index.html | 8 +- .../parser/typdep/util/Alphabet.html | 8 +- .../parser/typdep/util/Collector.html | 8 +- .../parser/typdep/util/DependencyLabel.html | 8 +- .../parser/typdep/util/Dictionary.html | 8 +- .../parser/typdep/util/DictionarySet.html | 8 +- .../parser/typdep/util/FeatureVector.html | 8 +- .../parser/typdep/util/ScoreCollector.html | 8 +- .../annotators/parser/typdep/util/Utils.html | 8 +- .../annotators/parser/typdep/util/index.html | 8 +- .../nlp/annotators/pos/index.html | 8 +- .../pos/perceptron/AveragedPerceptron.html | 8 +- .../pos/perceptron/PerceptronApproach$.html | 8 +- .../pos/perceptron/PerceptronApproach.html | 10 +- .../PerceptronApproachDistributed$.html | 8 +- .../PerceptronApproachDistributed.html | 10 +- .../pos/perceptron/PerceptronModel$.html | 8 +- .../pos/perceptron/PerceptronModel.html | 10 +- .../perceptron/PerceptronPredictionUtils.html | 8 +- .../perceptron/PerceptronTrainingUtils.html | 8 +- .../pos/perceptron/PerceptronUtils.html | 8 +- .../ReadablePretrainedPerceptron.html | 8 +- .../StringMapStringDoubleAccumulator.html | 8 +- .../perceptron/TrainingPerceptronLegacy.html | 8 +- .../TupleKeyLongDoubleMapAccumulator.html | 8 +- .../nlp/annotators/pos/perceptron/index.html | 8 +- .../sbd/SentenceDetectorParams.html | 10 +- .../nlp/annotators/sbd/index.html | 8 +- .../sbd/pragmatic/CustomPragmaticMethod.html | 8 +- .../sbd/pragmatic/DefaultPragmaticMethod.html | 8 +- .../sbd/pragmatic/MixedPragmaticMethod.html | 8 +- .../pragmatic/PragmaticContentFormatter$.html | 8 +- .../pragmatic/PragmaticContentFormatter.html | 8 +- .../sbd/pragmatic/PragmaticDictionaries$.html | 8 +- .../sbd/pragmatic/PragmaticMethod.html | 8 +- .../pragmatic/PragmaticSentenceExtractor.html | 8 +- .../sbd/pragmatic/PragmaticSymbols$.html | 8 +- .../annotators/sbd/pragmatic/RuleSymbols.html | 8 +- 
.../sbd/pragmatic/SentenceDetector$.html | 8 +- .../sbd/pragmatic/SentenceDetector.html | 10 +- .../nlp/annotators/sbd/pragmatic/index.html | 8 +- .../nlp/annotators/sda/index.html | 8 +- .../sda/pragmatic/PragmaticScorer.html | 8 +- .../sda/pragmatic/SentimentDetector$.html | 8 +- .../sda/pragmatic/SentimentDetector.html | 10 +- .../pragmatic/SentimentDetectorModel$.html | 8 +- .../sda/pragmatic/SentimentDetectorModel.html | 10 +- .../nlp/annotators/sda/pragmatic/index.html | 8 +- .../sda/vivekn/ReadablePretrainedVivekn.html | 8 +- .../sda/vivekn/ViveknSentimentApproach.html | 10 +- .../sda/vivekn/ViveknSentimentModel$.html | 8 +- .../sda/vivekn/ViveknSentimentModel.html | 10 +- .../sda/vivekn/ViveknSentimentUtils.html | 8 +- .../nlp/annotators/sda/vivekn/index.html | 8 +- .../sentence_detector_dl/Metrics.html | 8 +- .../ReadablePretrainedSentenceDetectorDL.html | 8 +- .../ReadsSentenceDetectorDLGraph.html | 8 +- .../SentenceDetectorDLApproach.html | 10 +- .../SentenceDetectorDLEncoder$.html | 8 +- .../SentenceDetectorDLEncoder.html | 8 +- .../SentenceDetectorDLEncoderParam.html | 8 +- .../SentenceDetectorDLModel$.html | 8 +- .../SentenceDetectorDLModel.html | 10 +- .../sentence_detector_dl/index.html | 8 +- .../annotators/seq2seq/BartTransformer$.html | 14 +- .../annotators/seq2seq/BartTransformer.html | 65 +- .../annotators/seq2seq/GPT2Transformer$.html | 8 +- .../annotators/seq2seq/GPT2Transformer.html | 10 +- .../seq2seq/MarianTransformer$.html | 8 +- .../annotators/seq2seq/MarianTransformer.html | 14 +- .../seq2seq/ReadBartTransformerDLModel.html | 14 +- .../seq2seq/ReadGPT2TransformerDLModel.html | 8 +- .../seq2seq/ReadMarianMTDLModel.html | 8 +- .../seq2seq/ReadT5TransformerDLModel.html | 8 +- ...eadablePretrainedBartTransformerModel.html | 8 +- ...eadablePretrainedGPT2TransformerModel.html | 8 +- .../ReadablePretrainedMarianMTModel.html | 8 +- .../ReadablePretrainedT5TransformerModel.html | 8 +- .../annotators/seq2seq/T5Transformer$.html | 8 +- .../nlp/annotators/seq2seq/T5Transformer.html | 10 +- .../nlp/annotators/seq2seq/index.html | 12 +- .../spell/context/CandidateStrategy$.html | 8 +- ...ntextSpellCheckerApproach$ArrayHelper.html | 8 +- .../context/ContextSpellCheckerApproach.html | 10 +- .../context/ContextSpellCheckerModel$.html | 8 +- .../ContextSpellCheckerModel$StringTools.html | 8 +- .../context/ContextSpellCheckerModel.html | 10 +- .../spell/context/HasTransducerFeatures.html | 8 +- .../spell/context/LangModelSentence.html | 8 +- .../ReadablePretrainedContextSpell.html | 8 +- .../context/ReadsLanguageModelGraph.html | 8 +- .../spell/context/WeightedLevenshtein.html | 8 +- .../nlp/annotators/spell/context/index.html | 8 +- .../spell/context/parser/AgeToken.html | 8 +- .../spell/context/parser/DateToken.html | 8 +- .../context/parser/GenericRegexParser.html | 8 +- .../context/parser/GenericVocabParser.html | 8 +- .../spell/context/parser/LocationClass.html | 8 +- .../spell/context/parser/MainVocab.html | 8 +- .../spell/context/parser/MedicationClass.html | 8 +- .../spell/context/parser/NamesClass.html | 8 +- .../spell/context/parser/NumberToken.html | 8 +- .../spell/context/parser/RegexParser.html | 8 +- .../context/parser/SerializableClass.html | 8 +- .../context/parser/SpecialClassParser.html | 8 +- .../context/parser/TransducerSeqFeature.html | 42 +- .../spell/context/parser/UnitToken.html | 8 +- .../spell/context/parser/VocabParser.html | 8 +- .../spell/context/parser/index.html | 8 +- .../nlp/annotators/spell/index.html | 8 +- 
.../spell/norvig/NorvigSweetingApproach$.html | 8 +- .../spell/norvig/NorvigSweetingApproach.html | 10 +- .../spell/norvig/NorvigSweetingModel$.html | 8 +- .../spell/norvig/NorvigSweetingModel.html | 10 +- .../spell/norvig/NorvigSweetingParams.html | 10 +- .../norvig/ReadablePretrainedNorvig.html | 8 +- .../nlp/annotators/spell/norvig/index.html | 8 +- .../ReadablePretrainedSymmetric.html | 8 +- .../symmetric/SymmetricDeleteApproach$.html | 8 +- .../symmetric/SymmetricDeleteApproach.html | 10 +- .../symmetric/SymmetricDeleteModel$.html | 8 +- .../SymmetricDeleteModel$SuggestedWord.html | 8 +- .../spell/symmetric/SymmetricDeleteModel.html | 10 +- .../symmetric/SymmetricDeleteParams.html | 10 +- .../nlp/annotators/spell/symmetric/index.html | 8 +- .../nlp/annotators/spell/util/Utilities$.html | 8 +- .../nlp/annotators/spell/util/index.html | 8 +- .../nlp/annotators/tapas/TapasCellDate$.html | 8 +- .../nlp/annotators/tapas/TapasCellDate.html | 8 +- .../nlp/annotators/tapas/TapasCellValue$.html | 8 +- .../nlp/annotators/tapas/TapasCellValue.html | 8 +- .../nlp/annotators/tapas/TapasEncoder.html | 24 +- .../nlp/annotators/tapas/TapasInputData.html | 8 +- .../tapas/TapasNumericRelation$.html | 8 +- .../tapas/TapasNumericValueSpan$.html | 8 +- .../tapas/TapasNumericValueSpan.html | 8 +- .../nlp/annotators/tapas/index.html | 8 +- .../tokenizer/bpe/BartTokenizer.html | 8 +- .../tokenizer/bpe/BpeTokenizer$.html | 8 +- .../tokenizer/bpe/Gpt2Tokenizer.html | 8 +- .../tokenizer/bpe/RobertaTokenizer.html | 8 +- .../tokenizer/bpe/SpecialToken.html | 8 +- .../nlp/annotators/tokenizer/bpe/index.html | 8 +- .../nlp/annotators/tokenizer/index.html | 8 +- .../ws/ReadablePretrainedWordSegmenter.html | 8 +- .../nlp/annotators/ws/TagsType$.html | 8 +- .../annotators/ws/WordSegmenterApproach$.html | 8 +- .../annotators/ws/WordSegmenterApproach.html | 10 +- .../annotators/ws/WordSegmenterModel$.html | 8 +- .../nlp/annotators/ws/WordSegmenterModel.html | 10 +- .../johnsnowlabs/nlp/annotators/ws/index.html | 8 +- .../nlp/embeddings/AlbertEmbeddings$.html | 8 +- .../nlp/embeddings/AlbertEmbeddings.html | 10 +- .../nlp/embeddings/BertEmbeddings$.html | 8 +- .../nlp/embeddings/BertEmbeddings.html | 10 +- .../embeddings/BertSentenceEmbeddings$.html | 8 +- .../embeddings/BertSentenceEmbeddings.html | 10 +- .../nlp/embeddings/CamemBertEmbeddings$.html | 8 +- .../nlp/embeddings/CamemBertEmbeddings.html | 10 +- .../nlp/embeddings/ChunkEmbeddings$.html | 8 +- .../nlp/embeddings/ChunkEmbeddings.html | 10 +- .../nlp/embeddings/DeBertaEmbeddings$.html | 8 +- .../nlp/embeddings/DeBertaEmbeddings.html | 10 +- .../nlp/embeddings/DistilBertEmbeddings$.html | 8 +- .../nlp/embeddings/DistilBertEmbeddings.html | 10 +- .../nlp/embeddings/Doc2VecApproach$.html | 8 +- .../nlp/embeddings/Doc2VecApproach.html | 10 +- .../nlp/embeddings/Doc2VecModel$.html | 8 +- .../nlp/embeddings/Doc2VecModel.html | 10 +- .../nlp/embeddings/ElmoEmbeddings$.html | 8 +- .../nlp/embeddings/ElmoEmbeddings.html | 10 +- .../EmbeddingsCoverage$CoverageResult.html | 8 +- .../nlp/embeddings/EmbeddingsCoverage.html | 8 +- .../embeddings/HasEmbeddingsProperties.html | 10 +- .../nlp/embeddings/LongformerEmbeddings$.html | 8 +- .../nlp/embeddings/LongformerEmbeddings.html | 10 +- .../PoolingStrategy$$AnnotatorType$.html | 8 +- .../nlp/embeddings/PoolingStrategy$.html | 8 +- .../nlp/embeddings/ReadAlbertDLModel.html | 8 +- .../nlp/embeddings/ReadBertDLModel.html | 8 +- .../embeddings/ReadBertSentenceDLModel.html | 8 +- .../nlp/embeddings/ReadCamemBertDLModel.html | 8 +- 
.../nlp/embeddings/ReadDeBertaDLModel.html | 8 +- .../nlp/embeddings/ReadDistilBertDLModel.html | 8 +- .../nlp/embeddings/ReadElmoDLModel.html | 8 +- .../nlp/embeddings/ReadLongformerDLModel.html | 8 +- .../nlp/embeddings/ReadRobertaDLModel.html | 8 +- .../ReadRobertaSentenceDLModel.html | 8 +- .../nlp/embeddings/ReadUSEDLModel.html | 8 +- .../nlp/embeddings/ReadXlmRobertaDLModel.html | 8 +- .../ReadXlmRobertaSentenceDLModel.html | 8 +- .../nlp/embeddings/ReadXlnetDLModel.html | 8 +- .../ReadablePretrainedAlbertModel.html | 8 +- .../ReadablePretrainedBertModel.html | 8 +- .../ReadablePretrainedBertSentenceModel.html | 8 +- .../ReadablePretrainedCamemBertModel.html | 8 +- .../ReadablePretrainedDeBertaModel.html | 8 +- .../ReadablePretrainedDistilBertModel.html | 8 +- .../embeddings/ReadablePretrainedDoc2Vec.html | 8 +- .../ReadablePretrainedElmoModel.html | 8 +- .../ReadablePretrainedLongformerModel.html | 8 +- .../ReadablePretrainedRobertaModel.html | 8 +- ...eadablePretrainedRobertaSentenceModel.html | 8 +- .../ReadablePretrainedUSEModel.html | 8 +- .../ReadablePretrainedWord2Vec.html | 8 +- .../ReadablePretrainedWordEmbeddings.html | 8 +- .../ReadablePretrainedXlmRobertaModel.html | 8 +- ...ablePretrainedXlmRobertaSentenceModel.html | 8 +- .../ReadablePretrainedXlnetModel.html | 8 +- .../nlp/embeddings/ReadsFromBytes.html | 8 +- .../nlp/embeddings/RoBertaEmbeddings$.html | 8 +- .../nlp/embeddings/RoBertaEmbeddings.html | 10 +- .../RoBertaSentenceEmbeddings$.html | 8 +- .../embeddings/RoBertaSentenceEmbeddings.html | 10 +- .../nlp/embeddings/SentenceEmbeddings$.html | 8 +- .../nlp/embeddings/SentenceEmbeddings.html | 10 +- .../embeddings/UniversalSentenceEncoder$.html | 8 +- .../embeddings/UniversalSentenceEncoder.html | 10 +- .../nlp/embeddings/Word2VecApproach$.html | 8 +- .../nlp/embeddings/Word2VecApproach.html | 10 +- .../nlp/embeddings/Word2VecModel$.html | 8 +- .../nlp/embeddings/Word2VecModel.html | 10 +- .../nlp/embeddings/WordEmbeddings$.html | 8 +- .../nlp/embeddings/WordEmbeddings.html | 10 +- .../WordEmbeddingsBinaryIndexer$.html | 8 +- .../nlp/embeddings/WordEmbeddingsModel$.html | 8 +- .../nlp/embeddings/WordEmbeddingsModel.html | 10 +- .../nlp/embeddings/WordEmbeddingsReader.html | 8 +- .../WordEmbeddingsTextIndexer$.html | 8 +- .../nlp/embeddings/WordEmbeddingsWriter.html | 8 +- .../nlp/embeddings/XlmRoBertaEmbeddings$.html | 8 +- .../nlp/embeddings/XlmRoBertaEmbeddings.html | 10 +- .../XlmRoBertaSentenceEmbeddings$.html | 8 +- .../XlmRoBertaSentenceEmbeddings.html | 10 +- .../nlp/embeddings/XlnetEmbeddings$.html | 8 +- .../nlp/embeddings/XlnetEmbeddings.html | 10 +- .../johnsnowlabs/nlp/embeddings/index.html | 8 +- .../nlp/functions$$EachAnnotations.html | 8 +- .../nlp/functions$$ExplodeAnnotations.html | 8 +- .../nlp/functions$$FilterAnnotations.html | 8 +- .../nlp/functions$$MapAnnotations.html | 8 +- docs/api/com/johnsnowlabs/nlp/functions$.html | 8 +- docs/api/com/johnsnowlabs/nlp/index.html | 8 +- .../nlp/pretrained/PretrainedPipeline$.html | 8 +- .../nlp/pretrained/PretrainedPipeline.html | 8 +- .../pretrained/PythonResourceDownloader$.html | 8 +- .../nlp/pretrained/RepositoryMetadata.html | 8 +- .../nlp/pretrained/ResourceDownloader$.html | 8 +- .../nlp/pretrained/ResourceDownloader.html | 8 +- .../nlp/pretrained/ResourceMetadata$.html | 8 +- .../nlp/pretrained/ResourceMetadata.html | 8 +- .../nlp/pretrained/ResourceRequest.html | 8 +- .../nlp/pretrained/ResourceType$.html | 8 +- .../nlp/pretrained/S3ResourceDownloader.html | 8 +- 
.../johnsnowlabs/nlp/pretrained/index.html | 8 +- .../com/johnsnowlabs/nlp/recursive/index.html | 8 +- .../nlp/recursive/package$$Recursive.html | 8 +- .../recursive/package$$RecursiveModel.html | 8 +- .../nlp/serialization/ArrayFeature.html | 42 +- .../nlp/serialization/Feature.html | 42 +- .../nlp/serialization/MapFeature.html | 42 +- .../SerializedExternalResource.html | 8 +- .../nlp/serialization/SetFeature.html | 42 +- .../nlp/serialization/StructFeature.html | 42 +- .../nlp/serialization/TransducerFeature.html | 42 +- .../johnsnowlabs/nlp/serialization/index.html | 8 +- .../com/johnsnowlabs/nlp/training/CoNLL.html | 8 +- .../nlp/training/CoNLL2003NerReader.html | 8 +- .../nlp/training/CoNLLDocument.html | 8 +- .../CoNLLHelper$$CoNLLSentenceCols.html | 8 +- .../training/CoNLLHelper$$CoNLLTokenCols.html | 8 +- .../nlp/training/CoNLLHelper$.html | 8 +- .../com/johnsnowlabs/nlp/training/CoNLLU.html | 8 +- .../nlp/training/CoNLLUCols$.html | 8 +- .../nlp/training/CoNLLUDocument.html | 8 +- .../com/johnsnowlabs/nlp/training/POS.html | 8 +- .../johnsnowlabs/nlp/training/PubTator.html | 8 +- .../nlp/training/SpacyToAnnotation.html | 8 +- .../com/johnsnowlabs/nlp/training/index.html | 8 +- .../johnsnowlabs/nlp/util/FinisherUtil$.html | 8 +- .../johnsnowlabs/nlp/util/GraphBuilder.html | 8 +- .../nlp/util/LfuCache$CachedItem.html | 8 +- .../nlp/util/LfuCache$DoubleLinked.html | 8 +- .../nlp/util/LfuCache$FrequencyList.html | 8 +- .../com/johnsnowlabs/nlp/util/LfuCache.html | 8 +- .../nlp/util/LruMap$KeyPriority.html | 8 +- .../nlp/util/LruMap$KeyPriorityOrdering$.html | 8 +- .../api/com/johnsnowlabs/nlp/util/LruMap.html | 8 +- .../nlp/util/SparkNlpConfigKeys$.html | 8 +- docs/api/com/johnsnowlabs/nlp/util/index.html | 8 +- .../nlp/util/io/ExternalResource$.html | 12 +- .../nlp/util/io/ExternalResource.html | 12 +- .../util/{regex => io}/MatchStrategy$.html | 176 +- .../nlp/util/io/OutputHelper$.html | 12 +- .../com/johnsnowlabs/nlp/util/io/ReadAs$.html | 12 +- .../util/io/ResourceHelper$$SourceStream.html | 8 +- .../nlp/util/io/ResourceHelper$.html | 12 +- .../com/johnsnowlabs/nlp/util/io/index.html | 28 +- .../nlp/util/regex/RegexRule.html | 12 +- .../util/regex/RuleFactory$$RuleMatch.html | 32 +- .../nlp/util/regex/RuleFactory$.html | 20 +- .../nlp/util/regex/RuleFactory.html | 18 +- .../nlp/util/regex/TransformStrategy$.html | 12 +- .../johnsnowlabs/nlp/util/regex/index.html | 30 +- .../com/johnsnowlabs/storage/BytesKey.html | 8 +- .../com/johnsnowlabs/storage/Database$.html | 8 +- .../com/johnsnowlabs/storage/Database.html | 8 +- .../johnsnowlabs/storage/HasConnection.html | 8 +- .../com/johnsnowlabs/storage/HasStorage.html | 10 +- .../johnsnowlabs/storage/HasStorageModel.html | 10 +- .../storage/HasStorageOptions.html | 10 +- .../storage/HasStorageReader.html | 10 +- .../johnsnowlabs/storage/HasStorageRef$.html | 8 +- .../johnsnowlabs/storage/HasStorageRef.html | 10 +- .../storage/RocksDBConnection$.html | 8 +- .../storage/RocksDBConnection.html | 8 +- .../storage/StorageBatchWriter.html | 8 +- .../johnsnowlabs/storage/StorageFormat.html | 8 +- .../johnsnowlabs/storage/StorageHelper$.html | 8 +- .../johnsnowlabs/storage/StorageLocator$.html | 8 +- .../johnsnowlabs/storage/StorageLocator.html | 8 +- .../storage/StorageReadWriter.html | 8 +- .../johnsnowlabs/storage/StorageReadable.html | 8 +- .../johnsnowlabs/storage/StorageReader.html | 8 +- .../johnsnowlabs/storage/StorageWriter.html | 8 +- docs/api/com/johnsnowlabs/storage/index.html | 8 +- .../api/com/johnsnowlabs/util/Benchmark$.html | 
8 +- docs/api/com/johnsnowlabs/util/Build$.html | 8 +- .../johnsnowlabs/util/CoNLLGenerator$.html | 8 +- .../com/johnsnowlabs/util/ConfigHelper$.html | 8 +- .../com/johnsnowlabs/util/ConfigLoader$.html | 8 +- .../com/johnsnowlabs/util/FileHelper$.html | 8 +- .../com/johnsnowlabs/util/JsonParser$.html | 8 +- .../johnsnowlabs/util/PipelineModels$.html | 8 +- .../johnsnowlabs/util/TrainingHelper$.html | 8 +- docs/api/com/johnsnowlabs/util/Version$.html | 8 +- docs/api/com/johnsnowlabs/util/Version.html | 8 +- .../johnsnowlabs/util/ZipArchiveUtil$.html | 8 +- docs/api/com/johnsnowlabs/util/index.html | 8 +- .../util/spark/LongMapAccumulator.html | 8 +- .../util/spark/MapAccumulator.html | 8 +- .../johnsnowlabs/util/spark/SparkUtil$.html | 8 +- .../com/johnsnowlabs/util/spark/index.html | 8 +- docs/api/index.html | 8 +- docs/api/index.js | 2 +- docs/api/python/.buildinfo | 2 +- docs/api/python/genindex.html | 128 +- docs/api/python/getting_started/index.html | 20 +- docs/api/python/index.html | 2 +- docs/api/python/modules/index.html | 4 +- docs/api/python/modules/sparknlp.html | 6 +- .../python/modules/sparknlp/annotation.html | 2 +- .../modules/sparknlp/annotation_audio.html | 2 +- .../modules/sparknlp/annotation_image.html | 2 +- .../annotator/audio/hubert_for_ctc.html | 2 +- .../annotator/audio/wav2vec2_for_ctc.html | 2 +- .../sparknlp/annotator/chunk2_doc.html | 2 +- .../modules/sparknlp/annotator/chunker.html | 2 +- .../albert_for_question_answering.html | 18 +- .../albert_for_sequence_classification.html | 20 +- .../albert_for_token_classification.html | 20 +- .../bert_for_question_answering.html | 26 +- .../bert_for_sequence_classification.html | 20 +- .../bert_for_token_classification.html | 20 +- .../bert_for_zero_shot_classification.html | 26 +- .../camembert_for_question_answering.html | 20 +- ...camembert_for_sequence_classification.html | 20 +- .../camembert_for_token_classification.html | 20 +- .../classifier_dl/classifier_dl.html | 2 +- .../deberta_for_question_answering.html | 20 +- .../deberta_for_sequence_classification.html | 21 +- .../deberta_for_token_classification.html | 20 +- .../distil_bert_for_question_answering.html | 20 +- ...stil_bert_for_sequence_classification.html | 20 +- .../distil_bert_for_token_classification.html | 20 +- ...til_bert_for_zero_shot_classification.html | 20 +- .../longformer_for_question_answering.html | 20 +- ...ongformer_for_sequence_classification.html | 20 +- .../longformer_for_token_classification.html | 20 +- .../classifier_dl/multi_classifier_dl.html | 2 +- ...rta_bert_for_zero_shot_classification.html | 638 +++ .../roberta_for_question_answering.html | 20 +- .../roberta_for_sequence_classification.html | 20 +- .../roberta_for_token_classification.html | 2 +- .../annotator/classifier_dl/sentiment_dl.html | 2 +- .../tapas_for_question_answering.html | 2 +- .../xlm_roberta_for_question_answering.html | 20 +- ...m_roberta_for_sequence_classification.html | 20 +- .../xlm_roberta_for_token_classification.html | 20 +- .../xlnet_for_sequence_classification.html | 20 +- .../xlnet_for_token_classification.html | 20 +- .../annotator/coref/spanbert_coref.html | 20 +- .../cv/convnext_for_image_classification.html | 2 +- .../cv/swin_for_image_classification.html | 2 +- .../cv/vit_for_image_classification.html | 2 +- .../sparknlp/annotator/date2_chunk.html | 2 +- .../dependency/dependency_parser.html | 2 +- .../dependency/typed_dependency_parser.html | 2 +- .../annotator/document_normalizer.html | 2 +- .../embeddings/albert_embeddings.html | 20 +- 
.../annotator/embeddings/bert_embeddings.html | 20 +- .../embeddings/bert_sentence_embeddings.html | 28 +- .../embeddings/camembert_embeddings.html | 22 +- .../embeddings/chunk_embeddings.html | 2 +- .../embeddings/deberta_embeddings.html | 20 +- .../embeddings/distil_bert_embeddings.html | 20 +- .../annotator/embeddings/doc2vec.html | 2 +- .../annotator/embeddings/elmo_embeddings.html | 2 +- .../embeddings/longformer_embeddings.html | 20 +- .../embeddings/roberta_embeddings.html | 20 +- .../roberta_sentence_embeddings.html | 28 +- .../embeddings/sentence_embeddings.html | 2 +- .../universal_sentence_encoder.html | 2 +- .../annotator/embeddings/word2vec.html | 2 +- .../annotator/embeddings/word_embeddings.html | 2 +- .../embeddings/xlm_roberta_embeddings.html | 20 +- .../xlm_roberta_sentence_embeddings.html | 28 +- .../embeddings/xlnet_embeddings.html | 20 +- .../sparknlp/annotator/er/entity_ruler.html | 2 +- .../sparknlp/annotator/graph_extraction.html | 2 +- .../yake_keyword_extraction.html | 2 +- .../annotator/ld_dl/language_detector_dl.html | 2 +- .../sparknlp/annotator/lemmatizer.html | 2 +- .../annotator/matcher/big_text_matcher.html | 2 +- .../annotator/matcher/date_matcher.html | 22 +- .../annotator/matcher/multi_date_matcher.html | 2 +- .../annotator/matcher/regex_matcher.html | 2 +- .../annotator/matcher/text_matcher.html | 2 +- .../sparknlp/annotator/n_gram_generator.html | 2 +- .../sparknlp/annotator/ner/ner_approach.html | 2 +- .../sparknlp/annotator/ner/ner_converter.html | 2 +- .../sparknlp/annotator/ner/ner_crf.html | 2 +- .../sparknlp/annotator/ner/ner_dl.html | 2 +- .../annotator/ner/ner_overwriter.html | 2 +- .../annotator/ner/zero_shot_ner_model.html | 2 +- .../sparknlp/annotator/normalizer.html | 2 +- .../annotator/param/classifier_encoder.html | 2 +- .../annotator/param/evaluation_dl_params.html | 2 +- .../sparknlp/annotator/pos/perceptron.html | 2 +- .../annotator/sentence/sentence_detector.html | 2 +- .../sentence/sentence_detector_dl.html | 2 +- .../sentiment/sentiment_detector.html | 2 +- .../annotator/sentiment/vivekn_sentiment.html | 2 +- .../annotator/seq2seq/bart_transformer.html | 28 +- .../annotator/seq2seq/gpt2_transformer.html | 2 +- .../annotator/seq2seq/marian_transformer.html | 12 +- .../annotator/seq2seq/t5_transformer.html | 2 +- .../spell_check/context_spell_checker.html | 2 +- .../spell_check/norvig_sweeting.html | 2 +- .../spell_check/symmetric_delete.html | 2 +- .../modules/sparknlp/annotator/stemmer.html | 2 +- .../annotator/stop_words_cleaner.html | 2 +- .../annotator/tf_ner_dl_graph_builder.html | 2 +- .../annotator/token/chunk_tokenizer.html | 2 +- .../annotator/token/recursive_tokenizer.html | 2 +- .../annotator/token/regex_tokenizer.html | 2 +- .../sparknlp/annotator/token/tokenizer.html | 2 +- .../sparknlp/annotator/ws/word_segmenter.html | 2 +- .../sparknlp/base/audio_assembler.html | 2 +- .../modules/sparknlp/base/doc2_chunk.html | 2 +- .../sparknlp/base/document_assembler.html | 2 +- .../sparknlp/base/embeddings_finisher.html | 2 +- .../modules/sparknlp/base/finisher.html | 2 +- .../modules/sparknlp/base/graph_finisher.html | 2 +- .../sparknlp/base/has_recursive_fit.html | 2 +- .../base/has_recursive_transform.html | 2 +- .../sparknlp/base/image_assembler.html | 2 +- .../modules/sparknlp/base/light_pipeline.html | 2 +- .../base/multi_document_assembler.html | 2 +- .../sparknlp/base/recursive_pipeline.html | 2 +- .../sparknlp/base/table_assembler.html | 2 +- .../modules/sparknlp/base/token2_chunk.html | 2 +- 
.../sparknlp/base/token_assembler.html | 2 +- .../sparknlp/common/annotator_approach.html | 2 +- .../sparknlp/common/annotator_model.html | 2 +- .../sparknlp/common/annotator_properties.html | 2 +- .../sparknlp/common/match_strategy.html | 449 +++ .../modules/sparknlp/common/properties.html | 43 +- .../modules/sparknlp/common/read_as.html | 2 +- .../common/recursive_annotator_approach.html | 2 +- .../python/modules/sparknlp/common/utils.html | 2 +- .../python/modules/sparknlp/functions.html | 2 +- .../sparknlp/internal/annotator_java_ml.html | 2 +- .../internal/annotator_transformer.html | 2 +- .../internal/extended_java_wrapper.html | 2 +- .../internal/params_getters_setters.html | 2 +- .../modules/sparknlp/internal/recursive.html | 2 +- .../modules/sparknlp/logging/comet.html | 2 +- .../pretrained/pretrained_pipeline.html | 2 +- .../pretrained/resource_downloader.html | 2 +- .../modules/sparknlp/training/conll.html | 2 +- .../modules/sparknlp/training/conllu.html | 2 +- .../python/modules/sparknlp/training/pos.html | 2 +- .../modules/sparknlp/training/pub_tator.html | 2 +- .../training/spacy_to_annotation.html | 2 +- docs/api/python/objects.inv | Bin 12234 -> 12168 bytes docs/api/python/py-modindex.html | 12 +- .../sparknlp/annotation/index.html | 2 +- .../sparknlp/annotation_audio/index.html | 2 +- .../sparknlp/annotation_image/index.html | 2 +- .../annotator/audio/hubert_for_ctc/index.html | 2 +- .../sparknlp/annotator/audio/index.html | 2 +- .../audio/wav2vec2_for_ctc/index.html | 2 +- .../sparknlp/annotator/chunk2_doc/index.html | 3 +- .../sparknlp/annotator/chunker/index.html | 3 +- .../albert_for_question_answering/index.html | 17 +- .../index.html | 18 +- .../index.html | 18 +- .../bert_for_question_answering/index.html | 23 +- .../index.html | 18 +- .../bert_for_token_classification/index.html | 18 +- .../index.html | 30 +- .../index.html | 17 +- .../index.html | 18 +- .../index.html | 18 +- .../classifier_dl/classifier_dl/index.html | 3 +- .../deberta_for_question_answering/index.html | 17 +- .../index.html | 18 +- .../index.html | 18 +- .../index.html | 17 +- .../index.html | 18 +- .../index.html | 18 +- .../index.html | 18 +- .../annotator/classifier_dl/index.html | 4 +- .../index.html | 17 +- .../index.html | 18 +- .../index.html | 18 +- .../multi_classifier_dl/index.html | 3 +- .../index.html | 798 ++++ .../roberta_for_question_answering/index.html | 17 +- .../index.html | 18 +- .../index.html | 3 +- .../classifier_dl/sentiment_dl/index.html | 3 +- .../tapas_for_question_answering/index.html | 2 +- .../index.html | 17 +- .../index.html | 18 +- .../index.html | 18 +- .../index.html | 18 +- .../xlnet_for_token_classification/index.html | 18 +- .../sparknlp/annotator/coref/index.html | 2 +- .../annotator/coref/spanbert_coref/index.html | 17 +- .../index.html | 2 +- .../sparknlp/annotator/cv/index.html | 2 +- .../swin_for_image_classification/index.html | 2 +- .../vit_for_image_classification/index.html | 2 +- .../sparknlp/annotator/date2_chunk/index.html | 3 +- .../dependency/dependency_parser/index.html | 3 +- .../sparknlp/annotator/dependency/index.html | 3 +- .../typed_dependency_parser/index.html | 3 +- .../annotator/document_normalizer/index.html | 3 +- .../embeddings/albert_embeddings/index.html | 18 +- .../embeddings/bert_embeddings/index.html | 18 +- .../bert_sentence_embeddings/index.html | 18 +- .../camembert_embeddings/index.html | 18 +- .../embeddings/chunk_embeddings/index.html | 3 +- .../embeddings/deberta_embeddings/index.html | 18 +- 
.../distil_bert_embeddings/index.html | 18 +- .../annotator/embeddings/doc2vec/index.html | 3 +- .../embeddings/elmo_embeddings/index.html | 3 +- .../sparknlp/annotator/embeddings/index.html | 3 +- .../longformer_embeddings/index.html | 18 +- .../embeddings/roberta_embeddings/index.html | 18 +- .../roberta_sentence_embeddings/index.html | 18 +- .../embeddings/sentence_embeddings/index.html | 3 +- .../universal_sentence_encoder/index.html | 3 +- .../annotator/embeddings/word2vec/index.html | 3 +- .../embeddings/word_embeddings/index.html | 3 +- .../xlm_roberta_embeddings/index.html | 18 +- .../index.html | 18 +- .../embeddings/xlnet_embeddings/index.html | 18 +- .../annotator/er/entity_ruler/index.html | 3 +- .../sparknlp/annotator/er/index.html | 3 +- .../annotator/graph_extraction/index.html | 3 +- .../autosummary/sparknlp/annotator/index.html | 4 +- .../annotator/keyword_extraction/index.html | 3 +- .../yake_keyword_extraction/index.html | 3 +- .../sparknlp/annotator/ld_dl/index.html | 3 +- .../ld_dl/language_detector_dl/index.html | 3 +- .../sparknlp/annotator/lemmatizer/index.html | 3 +- .../matcher/big_text_matcher/index.html | 3 +- .../annotator/matcher/date_matcher/index.html | 20 +- .../sparknlp/annotator/matcher/index.html | 3 +- .../matcher/multi_date_matcher/index.html | 3 +- .../matcher/regex_matcher/index.html | 3 +- .../annotator/matcher/text_matcher/index.html | 3 +- .../annotator/n_gram_generator/index.html | 3 +- .../sparknlp/annotator/ner/index.html | 3 +- .../annotator/ner/ner_approach/index.html | 3 +- .../annotator/ner/ner_converter/index.html | 3 +- .../sparknlp/annotator/ner/ner_crf/index.html | 3 +- .../sparknlp/annotator/ner/ner_dl/index.html | 3 +- .../annotator/ner/ner_overwriter/index.html | 3 +- .../ner/zero_shot_ner_model/index.html | 2 +- .../sparknlp/annotator/normalizer/index.html | 3 +- .../param/classifier_encoder/index.html | 2 +- .../param/evaluation_dl_params/index.html | 2 +- .../sparknlp/annotator/param/index.html | 3 +- .../sparknlp/annotator/pos/index.html | 3 +- .../annotator/pos/perceptron/index.html | 3 +- .../sparknlp/annotator/sentence/index.html | 3 +- .../sentence/sentence_detector/index.html | 3 +- .../sentence/sentence_detector_dl/index.html | 3 +- .../sparknlp/annotator/sentiment/index.html | 3 +- .../sentiment/sentiment_detector/index.html | 3 +- .../sentiment/vivekn_sentiment/index.html | 3 +- .../seq2seq/bart_transformer/index.html | 33 +- .../seq2seq/gpt2_transformer/index.html | 3 +- .../sparknlp/annotator/seq2seq/index.html | 3 +- .../seq2seq/marian_transformer/index.html | 10 +- .../seq2seq/t5_transformer/index.html | 3 +- .../context_spell_checker/index.html | 3 +- .../sparknlp/annotator/spell_check/index.html | 3 +- .../spell_check/norvig_sweeting/index.html | 3 +- .../spell_check/symmetric_delete/index.html | 3 +- .../sparknlp/annotator/stemmer/index.html | 3 +- .../annotator/stop_words_cleaner/index.html | 3 +- .../tf_ner_dl_graph_builder/index.html | 2 +- .../token/chunk_tokenizer/index.html | 3 +- .../sparknlp/annotator/token/index.html | 3 +- .../token/recursive_tokenizer/index.html | 3 +- .../token/regex_tokenizer/index.html | 3 +- .../annotator/token/tokenizer/index.html | 3 +- .../sparknlp/annotator/ws/index.html | 3 +- .../annotator/ws/word_segmenter/index.html | 3 +- .../sparknlp/base/audio_assembler/index.html | 2 +- .../sparknlp/base/doc2_chunk/index.html | 2 +- .../base/document_assembler/index.html | 2 +- .../base/embeddings_finisher/index.html | 2 +- .../sparknlp/base/finisher/index.html | 2 +- 
.../sparknlp/base/graph_finisher/index.html | 2 +- .../base/has_recursive_fit/index.html | 2 +- .../base/has_recursive_transform/index.html | 2 +- .../sparknlp/base/image_assembler/index.html | 2 +- .../autosummary/sparknlp/base/index.html | 2 +- .../sparknlp/base/light_pipeline/index.html | 2 +- .../base/multi_document_assembler/index.html | 2 +- .../base/recursive_pipeline/index.html | 2 +- .../sparknlp/base/table_assembler/index.html | 2 +- .../sparknlp/base/token2_chunk/index.html | 2 +- .../sparknlp/base/token_assembler/index.html | 2 +- .../common/annotator_approach/index.html | 3 +- .../common/annotator_model/index.html | 3 +- .../common/annotator_properties/index.html | 3 +- .../sparknlp/common/annotator_type/index.html | 2 +- .../common/coverage_result/index.html | 3 +- .../autosummary/sparknlp/common/index.html | 4 +- .../sparknlp/common/match_strategy/index.html | 504 +++ .../sparknlp/common/properties/index.html | 3 +- .../sparknlp/common/read_as/index.html | 3 +- .../recursive_annotator_approach/index.html | 3 +- .../sparknlp/common/storage/index.html | 3 +- .../sparknlp/common/utils/index.html | 3 +- .../autosummary/sparknlp/functions/index.html | 2 +- .../reference/autosummary/sparknlp/index.html | 3 +- .../internal/annotator_java_ml/index.html | 2 +- .../internal/annotator_transformer/index.html | 2 +- .../internal/extended_java_wrapper/index.html | 2 +- .../autosummary/sparknlp/internal/index.html | 2 +- .../params_getters_setters/index.html | 2 +- .../sparknlp/internal/recursive/index.html | 2 +- .../sparknlp/logging/comet/index.html | 2 +- .../autosummary/sparknlp/logging/index.html | 2 +- .../sparknlp/pretrained/index.html | 2 +- .../pretrained/pretrained_pipeline/index.html | 2 +- .../pretrained/resource_downloader/index.html | 2 +- .../sparknlp/pretrained/utils/index.html | 2 +- .../sparknlp/training/conll/index.html | 2 +- .../sparknlp/training/conllu/index.html | 2 +- .../autosummary/sparknlp/training/index.html | 2 +- .../sparknlp/training/pos/index.html | 2 +- .../sparknlp/training/pub_tator/index.html | 2 +- .../training/spacy_to_annotation/index.html | 2 +- .../sparknlp/training/tfgraphs/index.html | 2 +- .../sparknlp/upload_to_hub/index.html | 2 +- .../autosummary/sparknlp/util/index.html | 2 +- docs/api/python/reference/index.html | 2 +- docs/api/python/search.html | 2 +- docs/api/python/searchindex.js | 2 +- .../python/static/documentation_options.js | 2 +- docs/api/python/third_party/Comet.html | 2 +- docs/api/python/third_party/MLflow.html | 2 +- docs/api/python/third_party/index.html | 2 +- docs/api/python/user_guide/annotation.html | 2 +- docs/api/python/user_guide/annotators.html | 2 +- .../python/user_guide/custom_pipelines.html | 2 +- docs/api/python/user_guide/helpers.html | 2 +- docs/api/python/user_guide/index.html | 2 +- .../python/user_guide/light_pipelines.html | 2 +- .../user_guide/pretrained_pipelines.html | 2 +- docs/api/python/user_guide/training.html | 2 +- .../nlp/annotators/audio/Wav2Vec2ForCTC.scala | 3 +- .../dl/AlbertForQuestionAnswering.scala | 3 +- .../dl/AlbertForSequenceClassification.scala | 3 +- .../dl/AlbertForTokenClassification.scala | 3 +- .../dl/BertForQuestionAnswering.scala | 3 +- .../dl/BertForSequenceClassification.scala | 3 +- .../dl/BertForTokenClassification.scala | 3 +- .../dl/BertForZeroShotClassification.scala | 3 +- .../dl/CamemBertForQuestionAnswering.scala | 3 +- .../CamemBertForSequenceClassification.scala | 3 +- .../dl/CamemBertForTokenClassification.scala | 3 +- .../dl/DeBertaForQuestionAnswering.scala | 3 +- 
.../dl/DeBertaForSequenceClassification.scala | 3 +- .../dl/DeBertaForTokenClassification.scala | 3 +- .../dl/DistilBertForQuestionAnswering.scala | 3 +- .../DistilBertForSequenceClassification.scala | 3 +- .../DistilBertForZeroShotClassification.scala | 3 +- .../dl/LongformerForQuestionAnswering.scala | 3 +- .../LongformerForSequenceClassification.scala | 3 +- .../dl/LongformerForTokenClassification.scala | 3 +- .../dl/RoBertaForQuestionAnswering.scala | 3 +- .../dl/RoBertaForSequenceClassification.scala | 3 +- .../dl/RoBertaForTokenClassification.scala | 3 +- .../dl/XlmRoBertaForQuestionAnswering.scala | 3 +- .../XlmRoBertaForSequenceClassification.scala | 3 +- .../dl/XlmRoBertaForTokenClassification.scala | 3 +- .../dl/XlnetForSequenceClassification.scala | 3 +- .../dl/XlnetForTokenClassification.scala | 3 +- .../annotators/coref/SpanBertCorefModel.scala | 3 +- .../cv/ViTForImageClassification.scala | 3 +- .../annotators/seq2seq/BartTransformer.scala | 3 +- .../seq2seq/MarianTransformer.scala | 3 +- .../annotators/seq2seq/T5Transformer.scala | 3 +- .../nlp/embeddings/AlbertEmbeddings.scala | 3 +- .../nlp/embeddings/BertEmbeddings.scala | 3 +- .../embeddings/BertSentenceEmbeddings.scala | 3 +- .../nlp/embeddings/CamemBertEmbeddings.scala | 3 +- .../nlp/embeddings/DeBertaEmbeddings.scala | 3 +- .../nlp/embeddings/DistilBertEmbeddings.scala | 3 +- .../nlp/embeddings/LongformerEmbeddings.scala | 3 +- .../nlp/embeddings/RoBertaEmbeddings.scala | 3 +- .../RoBertaSentenceEmbeddings.scala | 3 +- .../nlp/embeddings/XlmRoBertaEmbeddings.scala | 3 +- .../XlmRoBertaSentenceEmbeddings.scala | 3 +- .../nlp/embeddings/XlnetEmbeddings.scala | 3 +- 1351 files changed, 31716 insertions(+), 5868 deletions(-) create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$CachedDecoderEncoderAttentionMask$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$CachedDecoderEncoderInputIds$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$CachedDecoderInputCache1$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$CachedDecoderInputCache2$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$CachedDecoderInputIds$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$CachedEncoderOutput$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$CachedLogitsOutput$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$CachedOutPut2$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$CachedOutput1$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$InitCachedOutPut2$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$InitCachedOutput1$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$InitDecoderEncoderAttentionMask$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$InitDecoderEncoderInputIds$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$InitDecoderInputIds$.html create mode 100644 docs/api/com/johnsnowlabs/ml/tensorflow/sign/ModelSignatureConstants$$InitLogitsOutput$.html create mode 100644 
docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/ReadRoBertaForZeroShotDLModel.html create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/ReadablePretrainedRoBertaForZeroShotModel.html create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification$.html create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/RoBertaForZeroShotClassification.html rename docs/api/com/johnsnowlabs/nlp/util/{regex => io}/MatchStrategy$.html (80%) create mode 100644 docs/api/python/modules/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification.html create mode 100644 docs/api/python/modules/sparknlp/common/match_strategy.html create mode 100644 docs/api/python/reference/autosummary/sparknlp/annotator/classifier_dl/roberta_bert_for_zero_shot_classification/index.html create mode 100644 docs/api/python/reference/autosummary/sparknlp/common/match_strategy/index.html diff --git a/docs/api/com/index.html b/docs/api/com/index.html index 3d2cb0e3eb1422..5fec16c283b94a 100644 --- a/docs/api/com/index.html +++ b/docs/api/com/index.html @@ -3,9 +3,9 @@ - Spark NLP 4.4.1 ScalaDoc - com - - + Spark NLP 4.4.2 ScalaDoc - com + + @@ -28,7 +28,7 @@