From 0be91b412f2dbdbcaaa8b8d070a162ed4ee2ae25 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Wed, 2 Aug 2023 08:04:08 -0500 Subject: [PATCH 1/8] SPARKNLP-738 Setting upper and lower bound when computing accuracy (#13901) [skip test] --- src/main/scala/com/johnsnowlabs/ml/ai/SentenceDetectorDL.scala | 1 + .../johnsnowlabs/ml/tensorflow/TensorflowMultiClassifier.scala | 1 + 2 files changed, 2 insertions(+) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/SentenceDetectorDL.scala b/src/main/scala/com/johnsnowlabs/ml/ai/SentenceDetectorDL.scala index 55465c5dfd0d6d..535f4340d3a7df 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/SentenceDetectorDL.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/SentenceDetectorDL.scala @@ -156,6 +156,7 @@ private[johnsnowlabs] class SentenceDetectorDL( } acc /= batches + acc = acc.min(1.0f).max(0.0f) if (validationSplit > 0.0) { val (validationFeatures, validationLabels) = validationDataset.toArray.unzip diff --git a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowMultiClassifier.scala b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowMultiClassifier.scala index d481bfd3e4c109..c7cae824d45881 100644 --- a/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowMultiClassifier.scala +++ b/src/main/scala/com/johnsnowlabs/ml/tensorflow/TensorflowMultiClassifier.scala @@ -141,6 +141,7 @@ private[johnsnowlabs] class TensorflowMultiClassifier( } acc /= (trainSet.length / batchSize) + acc = acc.min(1.0f).max(0.0f) loss /= (trainSet.length / batchSize) val endTime = (System.nanoTime() - time) / 1e9 From d35d8c6baed4d2d1d79db9d671cd693bc585d2d3 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 2 Aug 2023 18:16:24 +0500 Subject: [PATCH 2/8] add xlm roberta classifier files (#13902) --- .../annotator/classifier_dl/__init__.py | 1 + ...lm_roberta_for_zero_shot_classification.py | 225 +++++++++ python/sparknlp/internal/__init__.py | 6 + 
...berta_for_zero_shot_classification_test.py | 52 ++ .../ml/ai/XlmRoBertaClassification.scala | 81 +++- .../com/johnsnowlabs/nlp/annotator.scala | 7 + .../XlmRoBertaForZeroShotClassification.scala | 447 ++++++++++++++++++ .../nlp/pretrained/ResourceDownloader.scala | 1 + ...rtaForZeroShotClassificationTestSpec.scala | 187 ++++++++ 9 files changed, 1005 insertions(+), 2 deletions(-) create mode 100644 python/sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py create mode 100644 python/test/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification_test.py create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XLMRobertaForZeroShotClassificationTestSpec.scala diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py index 816d0328ecf55b..eace3d0619ff4c 100644 --- a/python/sparknlp/annotator/classifier_dl/__init__.py +++ b/python/sparknlp/annotator/classifier_dl/__init__.py @@ -46,3 +46,4 @@ from sparknlp.annotator.classifier_dl.bert_for_zero_shot_classification import * from sparknlp.annotator.classifier_dl.distil_bert_for_zero_shot_classification import * from sparknlp.annotator.classifier_dl.roberta_bert_for_zero_shot_classification import * +from sparknlp.annotator.classifier_dl.xlm_roberta_for_zero_shot_classification import * \ No newline at end of file diff --git a/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py new file mode 100644 index 00000000000000..36670c155a2d14 --- /dev/null +++ b/python/sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.py @@ -0,0 +1,225 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may 
not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for XlmRoBertaForZeroShotClassification.""" + +from sparknlp.common import * + + +class XlmRoBertaForZeroShotClassification(AnnotatorModel, + HasCaseSensitiveProperties, + HasBatchedAnnotate, + HasClassifierActivationProperties, + HasCandidateLabelsProperties, + HasEngine): + """XlmRoBertaForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language + inference) tasks. Equivalent of `XlmRoBertaForSequenceClassification` models, but these models don't require a hardcoded + number of potential classes, they can be chosen at runtime. It usually means it's slower but it is much more + flexible. + + Note that the model will loop through all provided labels. So the more labels you have, the + longer this process will take. + + Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis + pair and passed to the pretrained model. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> sequenceClassifier = XlmRoBertaForZeroShotClassification.pretrained() \\ + ... .setInputCols(["token", "document"]) \\ + ... .setOutputCol("label") + + The default model is ``"xlm_roberta_large_zero_shot_classifier_xnli_anli"``, if no name is + provided. + + For available pretrained models please see the `Models Hub + `__. + + To see which models are compatible and how to import them see + `Import Transformers into Spark NLP 🚀 + `_. 
+ + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT, TOKEN`` ``CATEGORY`` + ====================== ====================== + + Parameters + ---------- + batchSize + Batch size. Large values allows faster processing but requires more + memory, by default 8 + caseSensitive + Whether to ignore case in tokens for embeddings matching, by default + True + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. + maxSentenceLength + Max sentence length to process, by default 128 + coalesceSentences + Instead of 1 class per sentence (if inputCols is `sentence`) output 1 + class per document by averaging probabilities in all sentences, by + default False + activation + Whether to calculate logits via Softmax or Sigmoid, by default + `"softmax"`. + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> documentAssembler = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> tokenizer = Tokenizer() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("token") + >>> sequenceClassifier = XlmRoBertaForZeroShotClassification.pretrained() \\ + ... .setInputCols(["token", "document"]) \\ + ... .setOutputCol("label") \\ + ... .setCaseSensitive(True) + >>> pipeline = Pipeline().setStages([ + ... documentAssembler, + ... tokenizer, + ... sequenceClassifier + ... 
]) + >>> data = spark.createDataFrame([["I loved this movie when I was a child.", "It was pretty boring."]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.select("label.result").show(truncate=False) + +------+ + |result| + +------+ + |[pos] | + |[neg] | + +------+ + """ + name = "XlmRoBertaForZeroShotClassification" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN] + + outputAnnotatorType = AnnotatorType.CATEGORY + + maxSentenceLength = Param(Params._dummy(), + "maxSentenceLength", + "Max sentence length to process", + typeConverter=TypeConverters.toInt) + + configProtoBytes = Param(Params._dummy(), + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", + TypeConverters.toListInt) + + coalesceSentences = Param(Params._dummy(), "coalesceSentences", + "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.", + TypeConverters.toBoolean) + + def getClasses(self): + """ + Returns labels used to train this model + """ + return self._call_java("getClasses") + + def setConfigProtoBytes(self, b): + """Sets configProto from tensorflow, serialized into byte array. + + Parameters + ---------- + b : List[int] + ConfigProto from tensorflow, serialized into byte array + """ + return self._set(configProtoBytes=b) + + def setMaxSentenceLength(self, value): + """Sets max sentence length to process, by default 128. + + Parameters + ---------- + value : int + Max sentence length to process + """ + return self._set(maxSentenceLength=value) + + def setCoalesceSentences(self, value): + """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging + probabilities in all sentences. 
Due to max sequence length limit in almost all transformer models such as XlmRoBerta + (512 tokens), this parameter helps to feed all the sentences into the model and average all the probabilities + for the entire document instead of probabilities per sentence. (Default: False) + + Parameters + ---------- + value : bool + If the output of all sentences will be averaged to one output + """ + return self._set(coalesceSentences=value) + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForZeroShotClassification", + java_model=None): + super(XlmRoBertaForZeroShotClassification, self).__init__( + classname=classname, + java_model=java_model + ) + self._setDefault( + batchSize=8, + maxSentenceLength=128, + caseSensitive=True, + coalesceSentences=False, + activation="softmax" + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + XlmRoBertaForZeroShotClassification + The restored model + """ + from sparknlp.internal import _XlmRoBertaForZeroShotClassification + jModel = _XlmRoBertaForZeroShotClassification(folder, spark_session._jsparkSession)._java_obj + return XlmRoBertaForZeroShotClassification(java_model=jModel) + + @staticmethod + def pretrained(name="xlm_roberta_large_zero_shot_classifier_xnli_anli", lang="xx", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default + "xlm_roberta_large_zero_shot_classifier_xnli_anli" + lang : str, optional + Language of the pretrained model, by default "xx" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. 
+ + Returns + ------- + XlmRoBertaForZeroShotClassification + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(XlmRoBertaForZeroShotClassification, name, lang, remote_loc) diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index 3de0d91a830188..37b7132d174923 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -536,6 +536,12 @@ def __init__(self, path, jspark): jspark) +class _XlmRoBertaForZeroShotClassification(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_XlmRoBertaForZeroShotClassification, self).__init__( + "com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForZeroShotClassification.loadSavedModel", path, + jspark) + class _InstructorLoader(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_InstructorLoader, self).__init__("com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings.loadSavedModel", path, jspark) \ No newline at end of file diff --git a/python/test/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification_test.py b/python/test/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification_test.py new file mode 100644 index 00000000000000..375bde465b64e0 --- /dev/null +++ b/python/test/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification_test.py @@ -0,0 +1,52 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.slow +class XlmRoBertaForZeroShotClassificationTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + self.text = "I have a problem with my iphone that needs to be resolved asap!!" + self.inputDataset = self.spark.createDataFrame([[self.text]]) \ + .toDF("text") + + def runTest(self): + document_assembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + + tokenizer = Tokenizer().setInputCols("document").setOutputCol("token") + + zero_shot_classifier = XlmRoBertaForZeroShotClassification \ + .pretrained() \ + .setInputCols(["document", "token"]) \ + .setOutputCol("class") \ + .setCandidateLabels(["urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology"]) + + pipeline = Pipeline(stages=[ + document_assembler, + tokenizer, + zero_shot_classifier + ]) + + model = pipeline.fit(self.inputDataset) + model.transform(self.inputDataset).show() + light_pipeline = LightPipeline(model) + annotations_result = light_pipeline.fullAnnotate(self.text) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala index 78da6bf15a2eec..bddf0da0bbd368 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala @@ -20,6 +20,7 @@ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, Sente import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} import 
com.johnsnowlabs.nlp.{ActivationFunction, Annotation} import org.tensorflow.ndarray.buffer.IntDataBuffer @@ -76,8 +77,20 @@ private[johnsnowlabs] class XlmRoBertaClassification( def tokenizeSeqString( candidateLabels: Seq[String], maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = ??? + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + + val basicTokenizer = new BasicTokenizer(caseSensitive) + val encoder = + new SentencepieceEncoder(spp, caseSensitive, sentencePieceDelimiterId, pieceIdOffset = 1) + val labelsToSentences = candidateLabels.map { s => Sentence(s, 0, s.length - 1, 0) } + + labelsToSentences.map(label => { + val tokens = basicTokenizer.tokenize(label) + val wordpieceTokens = tokens.flatMap(token => encoder.encode(token)).take(maxSeqLength) + WordpieceTokenizedSentence(wordpieceTokens) + }) + } def tokenizeDocument( docs: Seq[Annotation], maxSeqLength: Int, @@ -223,7 +236,71 @@ private[johnsnowlabs] class XlmRoBertaClassification( batch: Seq[Array[Int]], entailmentId: Int, contradictionId: Int, - activation: String): Array[Array[Float]] = ??? 
+ activation: String): Array[Array[Float]] = { + val tensors = new TensorResources() + + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + val batchLength = batch.length + + val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val segmentBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + + // [nb of encoded sentences , maxSentenceLength] + val shape = Array(batch.length.toLong, maxSentenceLength) + + batch.zipWithIndex + .foreach { case (sentence, idx) => + val offset = idx * maxSentenceLength + tokenBuffers.offset(offset).write(sentence) + maskBuffers.offset(offset).write(sentence.map(x => if (x == 0) 0 else 1)) + val sentenceEndTokenIndex = sentence.indexOf(sentenceEndTokenId) + segmentBuffers + .offset(offset) + .write( + sentence.indices + .map(i => + if (i < sentenceEndTokenIndex) 0 + else if (i == sentenceEndTokenIndex) 1 + else 1) + .toArray) + } + + val session = tensorflowWrapper.getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + val runner = session.runner + + val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) + val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers) + + runner + .feed( + _tfXlmRoBertaSignatures.getOrElse( + ModelSignatureConstants.InputIds.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfXlmRoBertaSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .fetch(_tfXlmRoBertaSignatures + .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) + + val outs = runner.run().asScala + val rawScores = TensorResources.extractFloats(outs.head) + + outs.foreach(_.close()) + 
tensors.clearSession(outs) + tensors.clearTensors() + + val dim = rawScores.length / batchLength + rawScores + .grouped(dim) + .toArray + } def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { val tensors = new TensorResources() diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index d41a32502c9220..fdf5eb0ea2929d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -705,4 +705,11 @@ package object annotator { object RoBertaForZeroShotClassification extends ReadablePretrainedRoBertaForZeroShotModel with ReadRoBertaForZeroShotDLModel + + type XlmRobertaBertForZeroShotClassification = + com.johnsnowlabs.nlp.annotators.classifier.dl.XlmRoBertaForZeroShotClassification + + object XlmRoBertaForZeroShotClassification + extends ReadablePretrainedXlmRoBertaForZeroShotModel + with ReadXlmRoBertaForZeroShotDLModel } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification.scala new file mode 100644 index 00000000000000..981a3ff73f77fa --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification.scala @@ -0,0 +1,447 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.ml.ai.XlmRoBertaClassification +import com.johnsnowlabs.ml.tensorflow._ +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ + ReadSentencePieceModel, + SentencePieceWrapper, + WriteSentencePieceModel +} +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadSentencePieceAsset, + loadTextAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.serialization.MapFeature +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param.{BooleanParam, IntArrayParam, IntParam} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +/** XlmRoBertaForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI + * (natural language inference) tasks. Equivalent of `XlmRoBertaForSequenceClassification` + * models, but these models don't require a hardcoded number of potential classes, they can be + * chosen at runtime. It usually means it's slower but it is much more flexible. + * + * Note that the model will loop through all provided labels. So the more labels you have, the + * longer this process will take. + * + * Any combination of sequences and labels can be passed and each combination will be posed as a + * premise/hypothesis pair and passed to the pretrained model. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val sequenceClassifier = XlmRoBertaForZeroShotClassification.pretrained() + * .setInputCols("token", "document") + * .setOutputCol("label") + * }}} + * The default model is `"xlm_roberta_large_zero_shot_classifier_xnli_anli"`, if no name is + * provided. 
+ * + * For available pretrained models please see the + * [[https://sparknlp.org/models?task=Text+Classification Models Hub]]. + * + * To see which models are compatible and how to import them see + * [[https://github.com/JohnSnowLabs/spark-nlp/discussions/5669]]. + * + * ==Example== + * {{{ + * import spark.implicits._ + * import com.johnsnowlabs.nlp.base._ + * import com.johnsnowlabs.nlp.annotator._ + * import org.apache.spark.ml.Pipeline + * + * val documentAssembler = new DocumentAssembler() + * .setInputCol("text") + * .setOutputCol("document") + * + * val tokenizer = new Tokenizer() + * .setInputCols("document") + * .setOutputCol("token") + * + * val sequenceClassifier = XlmRoBertaForZeroShotClassification.pretrained() + * .setInputCols("token", "document") + * .setOutputCol("label") + * .setCaseSensitive(true) + * + * val pipeline = new Pipeline().setStages(Array( + * documentAssembler, + * tokenizer, + * sequenceClassifier + * )) + * + * val data = Seq("I loved this movie when I was a child.", "It was pretty boring.").toDF("text") + * val result = pipeline.fit(data).transform(data) + * + * result.select("label.result").show(false) + * +------+ + * |result| + * +------+ + * |[pos] | + * |[neg] | + * +------+ + * }}} + * + * @see + * [[XlmRoBertaForSequenceClassification]] for sequence-level classification + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based classifiers + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys 
this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class XlmRoBertaForZeroShotClassification(override val uid: String) + extends AnnotatorModel[XlmRoBertaForZeroShotClassification] + with HasBatchedAnnotate[XlmRoBertaForZeroShotClassification] + with WriteTensorflowModel + with WriteSentencePieceModel + with HasCaseSensitiveProperties + with HasClassifierActivationProperties + with HasEngine + with HasCandidateLabelsProperties { + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("XLMROBERTABERT_FOR_ZERO_SHOT_CLASSIFICATION")) + + /** Input Annotator Types: DOCUMENT, TOKEN + * + * @group anno + */ + override val inputAnnotatorTypes: Array[String] = + Array(AnnotatorType.DOCUMENT, AnnotatorType.TOKEN) + + /** Output Annotator Types: CATEGORY + * + * @group anno + */ + override val outputAnnotatorType: AnnotatorType = AnnotatorType.CATEGORY + + /** Labels used to decode predicted IDs back to string tags + * + * @group param + */ + val labels: MapFeature[String, Int] = new MapFeature(this, "labels").setProtected() + + /** @group setParam */ + def setLabels(value: Map[String, Int]): this.type = { + if (get(labels).isEmpty) + set(labels, value) + this + } + + /** Returns labels used to train this model */ + def getClasses: Array[String] = { + $$(labels).keys.toArray + } + + /** Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document + * by averaging probabilities in all sentences (Default: `false`). + * + * Due to max sequence length limit in almost all transformer models such as XLM-RoBERTa (512 + * tokens), this parameter helps feeding all the sentences into the model and averaging all the + * probabilities for the entire document instead of probabilities per sentence. 
+ * + * @group param + */ + val coalesceSentences = new BooleanParam( + this, + "coalesceSentences", + "If sets to true the output of all sentences will be averaged to one output instead of one output per sentence. Defaults to false.") + + /** @group setParam */ + def setCoalesceSentences(value: Boolean): this.type = set(coalesceSentences, value) + + /** @group getParam */ + def getCoalesceSentences: Boolean = $(coalesceSentences) + + /** ConfigProto from tensorflow, serialized into byte array. Get with + * `config_proto.SerializeToString()` + * + * @group param + */ + val configProtoBytes = new IntArrayParam( + this, + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()") + + /** @group setParam */ + def setConfigProtoBytes(bytes: Array[Int]): XlmRoBertaForZeroShotClassification.this.type = + set(this.configProtoBytes, bytes) + + /** @group getParam */ + def getConfigProtoBytes: Option[Array[Byte]] = get(this.configProtoBytes).map(_.map(_.toByte)) + + /** Max sentence length to process (Default: `128`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + require( + value <= 512, + "XLM-RoBERTa models do not support sequences longer than 512 because of trainable positional embeddings.") + require(value >= 1, "The maxSentenceLength must be at least 1") + set(maxSentenceLength, value) + this + } + + /** @group getParam */ + def getMaxSentenceLength: Int = $(maxSentenceLength) + + /** It contains TF model signatures for the loaded saved model + * + * @group param + */ + val signatures = + new MapFeature[String, String](model = this, name = "signatures").setProtected() + + /** @group setParam */ + def setSignatures(value: Map[String, String]): this.type = { + set(signatures, value) + this + } + + /** @group getParam */ + def getSignatures: 
Option[Map[String, String]] = get(this.signatures) + + private var _model: Option[Broadcast[XlmRoBertaClassification]] = None + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + tensorflowWrapper: TensorflowWrapper, + spp: SentencePieceWrapper): XlmRoBertaForZeroShotClassification = { + if (_model.isEmpty) { + _model = Some( + spark.sparkContext.broadcast( + new XlmRoBertaClassification( + tensorflowWrapper, + spp, + configProtoBytes = getConfigProtoBytes, + tags = $$(labels), + signatures = getSignatures))) + } + + this + } + + /** @group getParam */ + def getModelIfNotSet: XlmRoBertaClassification = _model.get.value + + /** Whether to lowercase tokens or not (Default: `true`). + * + * @group setParam + */ + override def setCaseSensitive(value: Boolean): this.type = { + set(this.caseSensitive, value) + } + + setDefault( + batchSize -> 8, + maxSentenceLength -> 128, + caseSensitive -> true, + coalesceSentences -> false) + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return + * any number of annotations processed for every input annotation. 
Not necessary one to one + * relationship + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + batchedAnnotations.map(annotations => { + val sentences = SentenceSplit.unpack(annotations).toArray + val tokenizedSentences = TokenizedWithSentence.unpack(annotations).toArray + + if (tokenizedSentences.nonEmpty) { + getModelIfNotSet.predictSequenceWithZeroShot( + tokenizedSentences, + sentences, + $(candidateLabels), + $(entailmentIdParam), + $(contradictionIdParam), + $(batchSize), + $(maxSentenceLength), + $(caseSensitive), + $(coalesceSentences), + $$(labels), + getActivation) + + } else { + Seq.empty[Annotation] + } + }) + } + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper, + "_xlmroberta_classification", + XlmRoBertaForZeroShotClassification.tfFile, + configProtoBytes = getConfigProtoBytes) + writeSentencePieceModel( + path, + spark, + getModelIfNotSet.spp, + "_xlmroberta", + XlmRoBertaForZeroShotClassification.sppFile) + } + +} + +trait ReadablePretrainedXlmRoBertaForZeroShotModel + extends ParamsAndFeaturesReadable[XlmRoBertaForZeroShotClassification] + with HasPretrained[XlmRoBertaForZeroShotClassification] { + override val defaultModelName: Some[String] = Some( + "xlm_roberta_large_zero_shot_classifier_xnli_anli") + override val defaultLang: String = "xx" + + /** Java compliant-overrides */ + override def pretrained(): XlmRoBertaForZeroShotClassification = super.pretrained() + + override def pretrained(name: String): XlmRoBertaForZeroShotClassification = + super.pretrained(name) + + override def pretrained(name: String, lang: String): XlmRoBertaForZeroShotClassification = + super.pretrained(name, lang) + + override def pretrained( + name: String, + lang: String, + remoteLoc: String): XlmRoBertaForZeroShotClassification = + super.pretrained(name, lang, remoteLoc) +} + +trait 
ReadXlmRoBertaForZeroShotDLModel extends ReadTensorflowModel with ReadSentencePieceModel { + this: ParamsAndFeaturesReadable[XlmRoBertaForZeroShotClassification] => + + override val tfFile: String = "xlmroberta_classification_tensorflow" + override val sppFile: String = "xlmroberta_spp" + + def readModel( + instance: XlmRoBertaForZeroShotClassification, + path: String, + spark: SparkSession): Unit = { + + val tf = + readTensorflowModel(path, spark, "_xlmroberta_classification_tf", initAllTables = false) + val spp = readSentencePieceModel(path, spark, "_xlmroberta_spp", sppFile) + instance.setModelIfNotSet(spark, tf, spp) + } + + addReader(readModel) + + def loadSavedModel( + modelPath: String, + spark: SparkSession): XlmRoBertaForZeroShotClassification = { + + val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) + + val spModel = loadSentencePieceAsset(localModelPath, "sentencepiece.bpe.model") + val labels = loadTextAsset(localModelPath, "labels.txt").zipWithIndex.toMap + + val entailmentIds = labels.filter(x => x._1.toLowerCase().startsWith("entail")).values.toArray + val contradictionIds = + labels.filter(x => x._1.toLowerCase().startsWith("contradict")).values.toArray + + require( + entailmentIds.length == 1 && contradictionIds.length == 1, + s"""This annotator supports classifiers trained on NLI datasets. You must have only at least 2 or maximum 3 labels in your dataset: + + example with 3 labels: 'contradict', 'neutral', 'entailment' + example with 2 labels: 'contradict', 'entailment' + + You can modify assets/labels.txt file to match the above format. 
+ + Current labels: ${labels.keys.mkString(", ")} + """) + + val annotatorModel = new XlmRoBertaForZeroShotClassification() + .setLabels(labels) + .setCandidateLabels(labels.keys.toArray) + + /* set the entailment id */ + annotatorModel.set(annotatorModel.entailmentIdParam, entailmentIds.head) + /* set the contradiction id */ + annotatorModel.set(annotatorModel.contradictionIdParam, contradictionIds.head) + /* set the engine */ + annotatorModel.set(annotatorModel.engine, detectedEngine) + + detectedEngine match { + case TensorFlow.name => + val (wrapper, signatures) = + TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) + + val _signatures = signatures match { + case Some(s) => s + case None => throw new Exception("Cannot load signature definitions from model!") + } + + /** the order of setSignatures is important if we use getSignatures inside + * setModelIfNotSet + */ + annotatorModel + .setSignatures(_signatures) + .setModelIfNotSet(spark, wrapper, spModel) + + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } +} + +/** This is the companion object of [[XlmRoBertaForZeroShotClassification]]. Please refer to that + * class for the documentation. 
+ */ +object XlmRoBertaForZeroShotClassification + extends ReadablePretrainedXlmRoBertaForZeroShotModel + with ReadXlmRoBertaForZeroShotDLModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index f38f9735604fb7..00d389830bb8a5 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -718,6 +718,7 @@ object PythonResourceDownloader { "BertForZeroShotClassification" -> BertForZeroShotClassification, "DistilBertForZeroShotClassification" -> DistilBertForZeroShotClassification, "RoBertaForZeroShotClassification" -> RoBertaForZeroShotClassification, + "XlmRoBertaForZeroShotClassification" -> XlmRoBertaForZeroShotClassification, "InstructorEmbeddings" -> InstructorEmbeddings, "E5Embeddings" -> E5Embeddings) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XLMRobertaForZeroShotClassificationTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XLMRobertaForZeroShotClassificationTestSpec.scala new file mode 100644 index 00000000000000..2ecc365cd6b0f9 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XLMRobertaForZeroShotClassificationTestSpec.scala @@ -0,0 +1,187 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.johnsnowlabs.nlp.annotators.classifier.dl + +import com.johnsnowlabs.nlp.annotators.Tokenizer +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.training.CoNLL +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import com.johnsnowlabs.util.Benchmark +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.sql.functions.{col, explode, size} +import org.scalatest.flatspec.AnyFlatSpec + +class XlmRoBertaForZeroShotClassificationTestSpec extends AnyFlatSpec { + + import ResourceHelper.spark.implicits._ + + val candidateLabels = + Array("urgent", "mobile", "travel", "movie", "music", "sport", "weather", "technology") + + "XlmRoBertaForZeroShotClassification" should "correctly load custom model with extracted signatures" taggedAs SlowTest in { + + val ddd = Seq( + "I have a problem with my iphone that needs to be resolved asap!!", + "Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app.", + "I have a phone and I love it!", + "I really want to visit Germany and I am planning to go there next year.", + "Let's watch some movies tonight! I am in the mood for a horror movie.", + "Have you watched the match yesterday? It was a great game!", + "We need to harry up and get to the airport. 
We are going to miss our flight!") + .toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + + val tokenClassifier = XlmRoBertaForZeroShotClassification + .pretrained() + .setInputCols(Array("token", "document")) + .setOutputCol("multi_class") + .setCaseSensitive(true) + .setCoalesceSentences(true) + .setCandidateLabels(candidateLabels) + + val pipeline = new Pipeline().setStages(Array(document, tokenizer, tokenClassifier)) + + val pipelineModel = pipeline.fit(ddd) + val pipelineDF = pipelineModel.transform(ddd) + + pipelineDF.select("multi_class").show(20, false) + pipelineDF.select("document.result", "multi_class.result").show(20, false) + pipelineDF + .withColumn("doc_size", size(col("document"))) + .withColumn("label_size", size(col("multi_class"))) + .where(col("doc_size") =!= col("label_size")) + .select("doc_size", "label_size", "document.result", "multi_class.result") + .show(20, false) + + val totalDocs = pipelineDF.select(explode($"document.result")).count.toInt + val totalLabels = pipelineDF.select(explode($"multi_class.result")).count.toInt + + println(s"total tokens: $totalDocs") + println(s"total embeddings: $totalLabels") + + assert(totalDocs == totalLabels) + } + + "XlmRoBertaForZeroShotClassification" should "be saved and loaded correctly" taggedAs SlowTest in { + + import ResourceHelper.spark.implicits._ + + val ddd = Seq( + "John Lenon was born in London and lived in Paris. 
My name is Sarah and I live in London", + "Rare Hendrix song draft sells for almost $17,000.", + "EU rejects German call to boycott British lamb .", + "TORONTO 1996-08-21").toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + + val tokenClassifier = XlmRoBertaForZeroShotClassification + .pretrained() + .setInputCols(Array("token", "document")) + .setOutputCol("label") + .setCaseSensitive(true) + .setCoalesceSentences(true) + .setCandidateLabels(candidateLabels) + .setBatchSize(2) + + val pipeline = new Pipeline().setStages(Array(document, tokenizer, tokenClassifier)) + + val pipelineModel = pipeline.fit(ddd) + val pipelineDF = pipelineModel.transform(ddd) + + pipelineDF.select("label.result").show(false) + + Benchmark.time("Time to save XlmRoBertaForZeroShotClassification pipeline model") { + pipelineModel.write.overwrite().save("./tmp_xlmrobertafornli_pipeline") + } + + Benchmark.time("Time to save XlmRoBertaForZeroShotClassification model") { + pipelineModel.stages.last + .asInstanceOf[XlmRoBertaForZeroShotClassification] + .write + .overwrite() + .save("./tmp_xlmrobertafornli_model") + } + + val loadedPipelineModel = PipelineModel.load("./tmp_xlmrobertafornli_pipeline") + loadedPipelineModel.transform(ddd).select("label.result").show(false) + + val loadedSequenceModel = + XlmRoBertaForZeroShotClassification.load("./tmp_xlmrobertafornli_model") + println(loadedSequenceModel.getClasses.mkString("Array(", ", ", ")")) + + } + + "XlmRoBertaForZeroShotClassification" should "benchmark test" taggedAs SlowTest in { + + val conll = CoNLL(explodeSentences = false) + val training_data = + conll + .readDataset(ResourceHelper.spark, "src/test/resources/conll2003/eng.train") + .repartition(12) + + val tokenClassifier = XlmRoBertaForZeroShotClassification + .pretrained() + .setInputCols(Array("token", "sentence")) + 
.setOutputCol("class") + .setCaseSensitive(true) + .setCoalesceSentences(true) + .setCandidateLabels(candidateLabels) + .setBatchSize(2) + + val pipeline = new Pipeline() + .setStages(Array(tokenClassifier)) + + val pipelineDF = pipeline.fit(training_data).transform(training_data).cache() + Benchmark.time("Time to save pipeline results") { + pipelineDF.write.mode("overwrite").parquet("./tmp_nli_classifier") + } + + pipelineDF.select("class").show(2, false) + pipelineDF.select("sentence.result", "class.result").show(2, false) + + // only works if it's softmax - one label per row + pipelineDF + .withColumn("doc_size", size(col("sentence"))) + .withColumn("label_size", size(col("class"))) + .where(col("doc_size") =!= col("label_size")) + .select("doc_size", "label_size", "sentence.result", "class.result") + .show(20, false) + + val totalDocs = pipelineDF.select(explode($"sentence.result")).count.toInt + val totalLabels = pipelineDF.select(explode($"class.result")).count.toInt + + println(s"total docs: $totalDocs") + println(s"total classes: $totalLabels") + + assert(totalDocs == totalLabels) + } + +} From 95217a78a3873da72b56b4d46e18f35e82ebf3b1 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 2 Aug 2023 15:18:28 +0200 Subject: [PATCH 3/8] Add support for ONNX to ALBERT, CamemBERT, and XLM-RoBERTa (#13907) * Add ONNX support to ALBERT embeddings * Add ONNX support to CamemBERT embeddings * Add ONNX support to XLM-RoBERTa * Update the Transformers docs page --- docs/en/transformers.md | 12 +- .../scala/com/johnsnowlabs/ml/ai/Albert.scala | 140 ++++++++++++------ .../com/johnsnowlabs/ml/ai/CamemBert.scala | 128 +++++++++++----- .../com/johnsnowlabs/ml/ai/XlmRoberta.scala | 124 +++++++++++----- .../nlp/embeddings/AlbertEmbeddings.scala | 74 +++++++-- .../nlp/embeddings/CamemBertEmbeddings.scala | 71 +++++++-- .../nlp/embeddings/XlmRoBertaEmbeddings.scala | 73 +++++++-- .../XlmRoBertaSentenceEmbeddings.scala | 15 +- 8 files changed, 457 insertions(+), 180 
deletions(-) diff --git a/docs/en/transformers.md b/docs/en/transformers.md index a224eeaa7e7266..d5335d7029a840 100644 --- a/docs/en/transformers.md +++ b/docs/en/transformers.md @@ -48,15 +48,15 @@ We have extended support for `HuggingFace` 🤗 and `TF Hub` exported models s | BertEmbeddings | ✅ | ✅ | ✅ | BERT - Small BERT - ELECTRA | ❎ | | BertSentenceEmbeddings | ✅ | ✅ | ❎ | BERT - Small BERT - ELECTRA | ❎ | | DistilBertEmbeddings | | ✅ | ✅ | DistilBERT | ❎ | -| CamemBertEmbeddings | | ✅ | ❎ | CamemBERT | ❎ | +| CamemBertEmbeddings | | ✅ | ✅ | CamemBERT | ❎ | | RoBertaEmbeddings | | ✅ | ✅ | RoBERTa - DistilRoBERTa | ❎ | | DeBertaEmbeddings | | ✅ | ✅ | DeBERTa-v2 - DeBERTa-v3 | ❎ | | XlmRoBertaEmbeddings | | ✅ | ✅ | XLM-RoBERTa | ❎ | -| AlbertEmbeddings | ✅ | ✅ | ❎ | ALBERT | ❎ | -| XlnetEmbeddings | | ✅ | ❎ | XLNet | ❎ | -| LongformerEmbeddings | | ✅ | ❎ | Longformer | -| ElmoEmbeddings | ❎ | | ❎ | | -| UniversalSentenceEncoder | ❎ | | ❎ | | +| AlbertEmbeddings | ✅ | ✅ | ✅ | ALBERT | ❎ | +| XlnetEmbeddings | | ✅ | ❌ | XLNet | ❎ | +| LongformerEmbeddings | | ✅ | ❌ | Longformer | +| ElmoEmbeddings | ❎ | | ❌ | | +| UniversalSentenceEncoder | ❎ | | ❌ | | | BertForTokenClassification | | ✅ | ❎ | [TFBertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForTokenClassification) | | DistilBertForTokenClassification | | ✅ | ❎ | [TFDistilBertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.TFDistilBertForTokenClassification) | | AlbertForTokenClassification | | ✅ | ❎ | [TFAlbertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/albert#transformers.TFAlbertForTokenClassification) | diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Albert.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Albert.scala index d981d740924825..bd4846945dc4bc 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Albert.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Albert.scala @@ 
-16,10 +16,13 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings +import com.johnsnowlabs.ml.onnx.OnnxWrapper import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import scala.collection.JavaConverters._ @@ -65,7 +68,8 @@ import scala.collection.JavaConverters._ * Configuration for TensorFlow session */ private[johnsnowlabs] class Albert( - val tensorflowWrapper: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], val spp: SentencePieceWrapper, batchSize: Int, configProtoBytes: Option[Array[Byte]] = None, @@ -75,6 +79,11 @@ private[johnsnowlabs] class Albert( val _tfAlbertSignatures: Map[String, String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name + // keys representing the input and output tensors of the ALBERT model private val SentenceStartTokenId = spp.getSppModel.pieceToId("[CLS]") private val SentenceEndTokenId = spp.getSppModel.pieceToId("[SEP]") @@ -94,47 +103,94 @@ private[johnsnowlabs] class Albert( val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max val batchLength = batch.length - val tensors = new TensorResources() - - val (tokenTensors, maskTensors, segmentTensors) = - PrepareEmbeddings.prepareBatchTensorsWithSegment( - tensors = tensors, - batch = batch, - maxSentenceLength = maxSentenceLength, - batchLength = batchLength) - - val runner = tensorflowWrapper - .getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = 
signatures, - initAllTables = false) - .runner - - runner - .feed( - _tfAlbertSignatures.getOrElse( - ModelSignatureConstants.InputIdsV1.key, - "missing_input_id_key"), - tokenTensors) - .feed( - _tfAlbertSignatures - .getOrElse(ModelSignatureConstants.AttentionMaskV1.key, "missing_input_mask_key"), - maskTensors) - .feed( - _tfAlbertSignatures - .getOrElse(ModelSignatureConstants.TokenTypeIdsV1.key, "missing_segment_ids_key"), - segmentTensors) - .fetch(_tfAlbertSignatures - .getOrElse(ModelSignatureConstants.LastHiddenStateV1.key, "missing_sequence_output_key")) - - val outs = runner.run().asScala - val embeddings = TensorResources.extractFloats(outs.head) - - tokenTensors.close() - maskTensors.close() - segmentTensors.close() - tensors.clearSession(outs) - tensors.clearTensors() + val embeddings = detectedEngine match { + + case ONNX.name => + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val segmentTensors = + OnnxTensor.createTensor(env, batch.map(x => Array.fill(maxSentenceLength)(0L)).toArray) + + val inputs = + Map( + "input_ids" -> tokenTensors, + "attention_mask" -> maskTensors, + "token_type_ids" -> segmentTensors).asJava + + // TODO: A try without a catch or finally is equivalent to putting its body in a block; no exceptions are handled. 
+ try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("last_hidden_state") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + + embeddings + } finally if (results != null) results.close() + } + case _ => + val tensors = new TensorResources() + + val (tokenTensors, maskTensors, segmentTensors) = + PrepareEmbeddings.prepareBatchTensorsWithSegment( + tensors = tensors, + batch = batch, + maxSentenceLength = maxSentenceLength, + batchLength = batchLength) + + val runner = tensorflowWrapper.get + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + .runner + + runner + .feed( + _tfAlbertSignatures.getOrElse( + ModelSignatureConstants.InputIdsV1.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfAlbertSignatures + .getOrElse(ModelSignatureConstants.AttentionMaskV1.key, "missing_input_mask_key"), + maskTensors) + .feed( + _tfAlbertSignatures + .getOrElse(ModelSignatureConstants.TokenTypeIdsV1.key, "missing_segment_ids_key"), + segmentTensors) + .fetch( + _tfAlbertSignatures + .getOrElse( + ModelSignatureConstants.LastHiddenStateV1.key, + "missing_sequence_output_key")) + + val outs = runner.run().asScala + val embeddings = TensorResources.extractFloats(outs.head) + + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + tensors.clearSession(outs) + tensors.clearTensors() + + embeddings + } PrepareEmbeddings.prepareBatchWordEmbeddings( batch, diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBert.scala b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBert.scala index 57dec6873da9cb..eb1d421b70ce0b 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBert.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBert.scala @@ -16,10 +16,13 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings 
+import com.johnsnowlabs.ml.onnx.OnnxWrapper import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import scala.collection.JavaConverters._ @@ -30,14 +33,15 @@ import scala.collection.JavaConverters._ * in 2019. It is a model trained on 138GB of French text. * * @param tensorflowWrapper - * Albert Model wrapper with TensorFlowWrapper + * CamemBERT Model wrapper with TensorFlowWrapper * @param spp - * Albert SentencePiece model with SentencePieceWrapper + * CamemBERT SentencePiece model with SentencePieceWrapper * @param configProtoBytes * Configuration for TensorFlow session */ private[johnsnowlabs] class CamemBert( - val tensorflowWrapper: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], val spp: SentencePieceWrapper, configProtoBytes: Option[Array[Byte]] = None, signatures: Option[Map[String, String]] = None) @@ -46,6 +50,11 @@ private[johnsnowlabs] class CamemBert( val _tfCamemBertSignatures: Map[String, String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name + /** HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated * in the actual # sentencepiece vocabulary (this is the case for '''''' and '''''') * '''NOTUSED": 0''','''"": 1''', '''"NOTUSED": 2''', '''"": 3''' @@ -69,48 +78,89 @@ private[johnsnowlabs] class CamemBert( val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max val batchLength = batch.length - val tensors = new TensorResources() - - val (tokenTensors, maskTensors) = - 
PrepareEmbeddings.prepareBatchTensors( - tensors = tensors, - batch = batch, - maxSentenceLength = maxSentenceLength, - batchLength = batchLength, - sentencePadTokenId = SentencePadTokenId) - - val runner = tensorflowWrapper - .getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = signatures, - initAllTables = false) - .runner - - runner - .feed( - _tfCamemBertSignatures - .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), - tokenTensors) - .feed( - _tfCamemBertSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .fetch(_tfCamemBertSignatures - .getOrElse(ModelSignatureConstants.LastHiddenStateV1.key, "missing_sequence_output_key")) - - val outs = runner.run().asScala - val embeddings = TensorResources.extractFloats(outs.head) - - tokenTensors.close() - maskTensors.close() - tensors.clearSession(outs) - tensors.clearTensors() + val embeddings = detectedEngine match { + + case ONNX.name => + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + // TODO: A try without a catch or finally is equivalent to putting its body in a block; no exceptions are handled. 
+ try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("last_hidden_state") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + + embeddings + } finally if (results != null) results.close() + } + case _ => + val tensors = new TensorResources() + + val (tokenTensors, maskTensors, segmentTensors) = + PrepareEmbeddings.prepareBatchTensorsWithSegment( + tensors = tensors, + batch = batch, + maxSentenceLength = maxSentenceLength, + batchLength = batchLength) + + val runner = tensorflowWrapper.get + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + .runner + + runner + .feed( + _tfCamemBertSignatures.getOrElse( + ModelSignatureConstants.InputIdsV1.key, + "missing_input_id_key"), + tokenTensors) + .feed( + _tfCamemBertSignatures + .getOrElse(ModelSignatureConstants.AttentionMaskV1.key, "missing_input_mask_key"), + maskTensors) + .fetch( + _tfCamemBertSignatures + .getOrElse( + ModelSignatureConstants.LastHiddenStateV1.key, + "missing_sequence_output_key")) + + val outs = runner.run().asScala + val embeddings = TensorResources.extractFloats(outs.head) + + tokenTensors.close() + maskTensors.close() + tensors.clearSession(outs) + tensors.clearTensors() + + embeddings + } PrepareEmbeddings.prepareBatchWordEmbeddings( batch, embeddings, maxSentenceLength, batchLength) + } def predict( diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoberta.scala b/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoberta.scala index fd42bfe6665f35..18448b1de90e90 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoberta.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoberta.scala @@ -16,11 +16,13 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings +import com.johnsnowlabs.ml.onnx.OnnxWrapper import 
com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} -import com.johnsnowlabs.ml.util.ModelArch +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} @@ -71,7 +73,8 @@ import scala.collection.JavaConverters._ * Model's inputs and output(s) signatures */ private[johnsnowlabs] class XlmRoberta( - val tensorflowWrapper: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], val spp: SentencePieceWrapper, caseSensitive: Boolean = true, configProtoBytes: Option[Array[Byte]] = None, @@ -81,6 +84,10 @@ private[johnsnowlabs] class XlmRoberta( val _tfRoBertaSignatures: Map[String, String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name private val SentenceStartTokenId = 0 private val SentenceEndTokenId = 2 @@ -104,42 +111,81 @@ private[johnsnowlabs] class XlmRoberta( val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max val batchLength = batch.length - val tensors = new TensorResources() - - val (tokenTensors, maskTensors) = - PrepareEmbeddings.prepareBatchTensors( - tensors = tensors, - batch = batch, - maxSentenceLength = maxSentenceLength, - batchLength = batchLength, - sentencePadTokenId = SentencePadTokenId) - - val runner = tensorflowWrapper - .getTFSessionWithSignature( - configProtoBytes = configProtoBytes, - savedSignatures = signatures, - initAllTables = false) - .runner - - runner - .feed( - _tfRoBertaSignatures - .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), - tokenTensors) - .feed( - 
_tfRoBertaSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .fetch(_tfRoBertaSignatures - .getOrElse(ModelSignatureConstants.LastHiddenState.key, "missing_sequence_output_key")) - - val outs = runner.run().asScala - val embeddings = TensorResources.extractFloats(outs.head) - - tokenTensors.close() - maskTensors.close() - tensors.clearSession(outs) - tensors.clearTensors() + val embeddings = detectedEngine match { + + case ONNX.name => + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + // TODO: A try without a catch or finally is equivalent to putting its body in a block; no exceptions are handled. 
+ try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("last_hidden_state") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + embeddings + + } finally if (results != null) results.close() + } + case _ => + val tensors = new TensorResources() + + val (tokenTensors, maskTensors) = + PrepareEmbeddings.prepareBatchTensors( + tensors = tensors, + batch = batch, + maxSentenceLength = maxSentenceLength, + batchLength = batchLength, + sentencePadTokenId = SentencePadTokenId) + + val runner = tensorflowWrapper.get + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + .runner + + runner + .feed( + _tfRoBertaSignatures + .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), + tokenTensors) + .feed( + _tfRoBertaSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .fetch( + _tfRoBertaSignatures + .getOrElse( + ModelSignatureConstants.LastHiddenState.key, + "missing_sequence_output_key")) + + val outs = runner.run().asScala + val embeddings = TensorResources.extractFloats(outs.head) + + tokenTensors.close() + maskTensors.close() + tensors.clearSession(outs) + tensors.clearTensors() + + embeddings + } PrepareEmbeddings.prepareBatchWordEmbeddings( batch, @@ -163,7 +209,7 @@ private[johnsnowlabs] class XlmRoberta( batchLength = batchLength, sentencePadTokenId = SentencePadTokenId) - val runner = tensorflowWrapper + val runner = tensorflowWrapper.get .getTFSessionWithSignature( configProtoBytes = configProtoBytes, savedSignatures = signatures, diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala index c8da89256f2b4c..7bbec2f29e3e1e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala +++ 
b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AlbertEmbeddings.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.Albert +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -28,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -180,6 +181,7 @@ class AlbertEmbeddings(override val uid: String) with HasBatchedAnnotate[AlbertEmbeddings] with WriteTensorflowModel with WriteSentencePieceModel + with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties @@ -266,7 +268,8 @@ class AlbertEmbeddings(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): AlbertEmbeddings = { if (_model.isEmpty) { @@ -274,6 +277,7 @@ class AlbertEmbeddings(override val uid: String) spark.sparkContext.broadcast( new Albert( tensorflowWrapper, + onnxWrapper, spp, batchSize = $(batchSize), configProtoBytes = getConfigProtoBytes, @@ -329,18 +333,34 @@ class AlbertEmbeddings(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_albert", - AlbertEmbeddings.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_albert" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + 
getModelIfNotSet.tensorflowWrapper.get, + suffix, + AlbertEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + AlbertEmbeddings.onnxFile) + + case _ => + throw new Exception(notSupportedEngineError) + } + writeSentencePieceModel( path, spark, getModelIfNotSet.spp, - "_albert", + suffix, AlbertEmbeddings.sppFile) } @@ -370,16 +390,33 @@ trait ReadablePretrainedAlbertModel super.pretrained(name, lang, remoteLoc) } -trait ReadAlbertDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadAlbertDLModel + extends ReadTensorflowModel + with ReadSentencePieceModel + with ReadOnnxModel { this: ParamsAndFeaturesReadable[AlbertEmbeddings] => override val tfFile: String = "albert_tensorflow" + override val onnxFile: String = "albert_onnx" override val sppFile: String = "albert_spp" def readModel(instance: AlbertEmbeddings, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_albert_tf", initAllTables = false) - val spp = readSentencePieceModel(path, spark, "_albert_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = readTensorflowModel(path, spark, "_albert_tf", initAllTables = false) + val spp = readSentencePieceModel(path, spark, "_albert_spp", sppFile) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + + case ONNX.name => { + val onnxWrapper = + readOnnxModel(path, spark, "_albert_onnx", zipped = true, useBundle = false, None) + val spp = readSentencePieceModel(path, spark, "_albert_spp", sppFile) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + } + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -397,7 +434,7 @@ trait ReadAlbertDLModel extends ReadTensorflowModel with ReadSentencePieceModel detectedEngine match { case TensorFlow.name => - val (wrapper, 
signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -410,7 +447,12 @@ trait ReadAlbertDLModel extends ReadTensorflowModel with ReadSentencePieceModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala index 914d9b87b91449..f59d0d46c0fa41 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/CamemBertEmbeddings.scala @@ -1,6 +1,7 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.CamemBert +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -12,7 +13,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -139,6 +140,7 @@ class CamemBertEmbeddings(override val uid: String) with HasBatchedAnnotate[CamemBertEmbeddings] with WriteTensorflowModel with WriteSentencePieceModel + with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties @@ -203,13 +205,15 @@ class CamemBertEmbeddings(override val uid: String) def 
setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): CamemBertEmbeddings = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new CamemBert( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, signatures = getSignatures))) @@ -298,20 +302,34 @@ class CamemBertEmbeddings(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) + val suffix = "_camembert" - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_camembert", - CamemBertEmbeddings.tfFile, - configProtoBytes = getConfigProtoBytes) + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + CamemBertEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + CamemBertEmbeddings.onnxFile) + + case _ => + throw new Exception(notSupportedEngineError) + } writeSentencePieceModel( path, spark, getModelIfNotSet.spp, - "_camembert", + suffix, CamemBertEmbeddings.sppFile) } @@ -335,17 +353,33 @@ trait ReadablePretrainedCamemBertModel super.pretrained(name, lang, remoteLoc) } -trait ReadCamemBertDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadCamemBertDLModel + extends ReadTensorflowModel + with ReadSentencePieceModel + with ReadOnnxModel { this: ParamsAndFeaturesReadable[CamemBertEmbeddings] => override val tfFile: String = "camembert_tensorflow" + override val onnxFile: String = "camembert_onnx" override val sppFile: String = "camembert_spp" def readModel(instance: CamemBertEmbeddings, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_camembert_tf", initAllTables = false) - val spp = 
readSentencePieceModel(path, spark, "_camembert_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = readTensorflowModel(path, spark, "_camembert_tf", initAllTables = false) + val spp = readSentencePieceModel(path, spark, "_camembert_spp", sppFile) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + + case ONNX.name => { + val onnxWrapper = + readOnnxModel(path, spark, "_camembert_onnx", zipped = true, useBundle = false, None) + val spp = readSentencePieceModel(path, spark, "_camembert_spp", sppFile) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + } + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -363,7 +397,7 @@ trait ReadCamemBertDLModel extends ReadTensorflowModel with ReadSentencePieceMod detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -376,7 +410,12 @@ trait ReadCamemBertDLModel extends ReadTensorflowModel with ReadSentencePieceMod */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala index 107da32535a946..2d59b18fdb3292 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaEmbeddings.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.embeddings import
com.johnsnowlabs.ml.ai.XlmRoberta +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -28,7 +29,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelArch, TensorFlow} +import com.johnsnowlabs.ml.util.{ModelArch, ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -167,6 +168,7 @@ class XlmRoBertaEmbeddings(override val uid: String) with HasBatchedAnnotate[XlmRoBertaEmbeddings] with WriteTensorflowModel with WriteSentencePieceModel + with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef with HasCaseSensitiveProperties @@ -235,13 +237,15 @@ class XlmRoBertaEmbeddings(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): XlmRoBertaEmbeddings = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new XlmRoberta( tensorflowWrapper, + onnxWrapper, spp, $(caseSensitive), configProtoBytes = getConfigProtoBytes, @@ -332,18 +336,34 @@ class XlmRoBertaEmbeddings(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_xlmroberta", - XlmRoBertaEmbeddings.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_xlmroberta" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + XlmRoBertaEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, 
+ spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + XlmRoBertaEmbeddings.onnxFile) + + case _ => + throw new Exception(notSupportedEngineError) + } + writeSentencePieceModel( path, spark, getModelIfNotSet.spp, - "_xlmroberta", + suffix, XlmRoBertaEmbeddings.sppFile) } @@ -367,17 +387,33 @@ trait ReadablePretrainedXlmRobertaModel super.pretrained(name, lang, remoteLoc) } -trait ReadXlmRobertaDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadXlmRobertaDLModel + extends ReadTensorflowModel + with ReadSentencePieceModel + with ReadOnnxModel { this: ParamsAndFeaturesReadable[XlmRoBertaEmbeddings] => override val tfFile: String = "xlmroberta_tensorflow" + override val onnxFile: String = "xlmroberta_onnx" override val sppFile: String = "xlmroberta_spp" def readModel(instance: XlmRoBertaEmbeddings, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_xlmroberta_tf", initAllTables = false) - val spp = readSentencePieceModel(path, spark, "_xlmroberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = readTensorflowModel(path, spark, "_xlmroberta_tf", initAllTables = false) + val spp = readSentencePieceModel(path, spark, "_xlmroberta_spp", sppFile) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + + case ONNX.name => { + val onnxWrapper = + readOnnxModel(path, spark, "_xlmroberta_onnx", zipped = true, useBundle = false, None) + val spp = readSentencePieceModel(path, spark, "_xlmroberta_spp", sppFile) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + } + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -395,7 +431,7 @@ trait ReadXlmRobertaDLModel extends ReadTensorflowModel with ReadSentencePieceMo detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, 
useBundle = true) val _signatures = signatures match { @@ -408,7 +444,12 @@ trait ReadXlmRobertaDLModel extends ReadTensorflowModel with ReadSentencePieceMo */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala index 07df2844768290..2f25f0adcdd8d6 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/XlmRoBertaSentenceEmbeddings.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.XlmRoberta +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -232,13 +233,15 @@ class XlmRoBertaSentenceEmbeddings(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): XlmRoBertaSentenceEmbeddings = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new XlmRoberta( tensorflowWrapper, + onnxWrapper, spp, $(caseSensitive), configProtoBytes = getConfigProtoBytes, @@ -319,7 +322,7 @@ class XlmRoBertaSentenceEmbeddings(override val uid: String) writeTensorflowModelV2( path, spark, - getModelIfNotSet.tensorflowWrapper, + getModelIfNotSet.tensorflowWrapper.get, "_xlmroberta", 
XlmRoBertaSentenceEmbeddings.tfFile, configProtoBytes = getConfigProtoBytes) @@ -364,9 +367,9 @@ trait ReadXlmRobertaSentenceDLModel extends ReadTensorflowModel with ReadSentenc path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_xlmroberta_tf", initAllTables = false) + val tfWrapper = readTensorflowModel(path, spark, "_xlmroberta_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_xlmroberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) } addReader(readModel) @@ -384,7 +387,7 @@ trait ReadXlmRobertaSentenceDLModel extends ReadTensorflowModel with ReadSentenc detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -397,7 +400,7 @@ trait ReadXlmRobertaSentenceDLModel extends ReadTensorflowModel with ReadSentenc */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) case _ => throw new Exception(notSupportedEngineError) From 6fae7b8867a2ae74ab0c481072c45443d6397305 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Wed, 2 Aug 2023 08:18:53 -0500 Subject: [PATCH 4/8] SPARKNLP-873 Handling vocabulary type from Python side (#13908) --- .../nlp/annotators/seq2seq/MarianTransformer.scala | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala index ce18cf3ad4f8bd..2b9a1f9fe9b5e3 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MarianTransformer.scala 
@@ -37,6 +37,8 @@ import org.apache.spark.ml.param.{IntArrayParam, IntParam, Param, StringArrayPar import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.SparkSession +import scala.jdk.CollectionConverters.asScalaBufferConverter + /** MarianTransformer: Fast Neural Machine Translation * * Marian is an efficient, free Neural Machine Translation framework written in pure C++ with @@ -317,6 +319,13 @@ class MarianTransformer(override val uid: String) /** @group setParam * */ def getModelIfNotSet: Marian = _model.get.value + def getVocabulary: Array[String] = { + if ($(vocabulary).isInstanceOf[java.util.ArrayList[String]]) { + val arrayListValue = $(vocabulary).asInstanceOf[java.util.ArrayList[String]] + arrayListValue.asScala.toArray + } else $(vocabulary) + } + setDefault( maxInputLength -> 40, maxOutputLength -> 40, @@ -349,7 +358,7 @@ class MarianTransformer(override val uid: String) sentences = allAnnotations.map(_._1), maxInputLength = $(maxInputLength), maxOutputLength = $(maxOutputLength), - vocabs = $(vocabulary), + vocabs = getVocabulary, langId = $(langId), batchSize = $(batchSize), ignoreTokenIds = $(ignoreTokenIds)) @@ -441,7 +450,6 @@ trait ReadMarianMTDLModel extends ReadTensorflowModel with ReadSentencePieceMode addReader(readModel) def loadSavedModel(modelPath: String, spark: SparkSession): MarianTransformer = { - val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) val sppSrc = loadSentencePieceAsset(localModelPath, "source.spm") From 18cbb662bc9a94900d69815bf4c9e1da1bac78c0 Mon Sep 17 00:00:00 2001 From: Prabod Rathnayaka Date: Wed, 2 Aug 2023 23:19:57 +1000 Subject: [PATCH 5/8] Bug fixed #13898 (#13911) --- .../scala/com/johnsnowlabs/ml/ai/Bart.scala | 6 +- .../ml/ai/util/Generation/Generate.scala | 56 ++++++++++------- .../nlp/annotators/seq2seq/BartTestSpec.scala | 62 +++++++++++++++++++ 3 files changed, 99 insertions(+), 25 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala 
b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala index 3edba90c0f2bd1..f592d16ca8f442 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/Bart.scala @@ -126,7 +126,7 @@ private[johnsnowlabs] class Bart( } var sentBegin, nextSentEnd = 0 - batchDecoder.zip(sentences).map { case (content, sent) => + val annotations = batchDecoder.zip(sentences).map { case (content, sent) => nextSentEnd += content.length - 1 val annots = new Annotation( annotatorType = AnnotatorType.DOCUMENT, @@ -137,6 +137,10 @@ private[johnsnowlabs] class Bart( sentBegin += nextSentEnd + 1 annots } + tensorDecoder = new TensorResources() + nextStateTensor1 = None + nextStateTensor2 = None + annotations } /** @param batch diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala b/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala index ef1328233454e2..0168e700caa581 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala @@ -367,37 +367,45 @@ trait Generate { def multinomialSampling(logitValues: Array[Float], k: Int, seed: Option[Long]): Array[Int] = { val (distFiltered, indices) = logitValues.zipWithIndex.filter { case (elem, index) => !elem.isInfinite }.sorted.unzip + if (!distFiltered.isEmpty) { - val maxLogit = distFiltered.max - val expLogitValues = distFiltered.map(logit => math.exp(logit - maxLogit)) - val sumExpLogitValues = expLogitValues.sum - val probabilities = expLogitValues.map(_ / sumExpLogitValues) + val maxLogit = distFiltered.max + val expLogitValues = distFiltered.map(logit => math.exp(logit - maxLogit)) + val sumExpLogitValues = expLogitValues.sum + val probabilities = expLogitValues.map(_ / sumExpLogitValues) - val selectedIndices = new Array[Int](k) - var seededRandom = new scala.util.Random() - if (seed.isDefined) { - seededRandom = new scala.util.Random(seed.get) - } - for (i <- 0 
until k) { - var rand = scala.util.Random.nextDouble() + val selectedIndices = new Array[Int](k) + var seededRandom = new scala.util.Random() if (seed.isDefined) { - rand = new scala.util.Random(seed.get).nextDouble() + seededRandom = new scala.util.Random(seed.get) } - var cumProb = 0.0 - var j = 0 - while (j < probabilities.length - i) { - cumProb += probabilities(j) - if (rand < cumProb) { - selectedIndices(i) = indices(j) - probabilities(j) = 0.0 - indices(j) = indices(indices.length - i - 1) - j = probabilities.length + for (i <- 0 until k) { + var rand = scala.util.Random.nextDouble() + if (seed.isDefined) { + rand = new scala.util.Random(seed.get).nextDouble() + } + var cumProb = 0.0 + var j = 0 + while (j < probabilities.length - i) { + cumProb += probabilities(j) + if (rand < cumProb) { + selectedIndices(i) = indices(j) + probabilities(j) = 0.0 + indices(j) = indices(indices.length - i - 1) + j = probabilities.length + } + j += 1 } - j += 1 } - } - selectedIndices + selectedIndices + } else { + val selectedIndices = new Array[Int](k) + for (i <- 0 until k) { + selectedIndices(i) = 0 + } + selectedIndices + } } def getModelOutput( diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTestSpec.scala index 553567d53a4df3..08fd3bf97c2fd7 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/BartTestSpec.scala @@ -274,4 +274,66 @@ class BartTestSpec extends AnyFlatSpec { assert(dataframe1.equals(dataframe2)) } + + "bart-large-cnn" should "run SparkNLP pipeline with doSample=false and later change to true " taggedAs SlowTest in { + val testData = ResourceHelper.spark + .createDataFrame( + Seq( + ( + 1, + "Preheat the oven to 220°C/ fan200°C/gas 7. Trim the lamb fillet of fat and cut into slices the thickness" + + " of a chop. Cut the kidneys in half and snip out the white core. 
Melt a knob of dripping or 2 tablespoons " + + "of vegetable oil in a heavy large pan. Fry the lamb fillet in batches for 3-4 minutes, turning once, until " + + "browned. Set aside. Fry the kidneys and cook for 1-2 minutes, turning once, until browned. Set aside." + + "Wipe the pan with kitchen paper, then add the butter. Add the onions and fry for about 10 minutes until " + + "softened. Sprinkle in the flour and stir well for 1 minute. Gradually pour in the stock, stirring all the " + + "time to avoid lumps. Add the herbs. Stir the lamb and kidneys into the onions. Season well. Transfer to a" + + " large 2.5-litre casserole. Slice the peeled potatoes thinly and arrange on top in overlapping rows. Brush " + + "with melted butter and season. Cover and bake for 30 minutes. Reduce the oven temperature to 160°C" + + "/fan140°C/gas 3 and cook for a further 2 hours. Then increase the oven temperature to 200°C/ fan180°C/gas 6," + + " uncover, and brush the potatoes with more butter. Cook uncovered for 15-20 minutes, or until golden."))) + .toDF("id", "text") + + val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("documents") + + val bart = BartTransformer + .pretrained("distilbart_xsum_12_6") + .setTask("summarize:") + .setInputCols(Array("documents")) + .setDoSample(false) + .setRandomSeed(56) + .setMaxOutputLength(128) + .setTemperature(0.1) + .setOutputCol("summaries") + + val pipeline = new Pipeline().setStages(Array(documentAssembler, bart)) + + val model = pipeline.fit(testData) + + var dataframe1 = model + .transform(testData) + .select("summaries.result") + .collect() + .toSeq + .head + .getAs[Seq[String]](0) + .head + println(dataframe1) + + bart.setDoSample(true) + + dataframe1 = model + .transform(testData) + .select("summaries.result") + .collect() + .toSeq + .head + .getAs[Seq[String]](0) + .head + println(dataframe1) + + } + } From 2c1d691e3e4c8da4e5bc92e7dce616ce4554edd3 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: 
Wed, 2 Aug 2023 16:36:33 +0200 Subject: [PATCH 6/8] Update docs, CHANGELOG, fix imports [run doc] --- CHANGELOG | 33 +++++-- README.md | 88 +++++++++---------- build.sbt | 2 +- docs/_layouts/landing.html | 2 +- docs/en/concepts.md | 2 +- docs/en/examples.md | 4 +- docs/en/hardware_acceleration.md | 2 +- docs/en/install.md | 54 ++++++------ docs/en/spark_nlp.md | 2 +- python/README.md | 88 +++++++++---------- python/docs/conf.py | 2 +- python/setup.py | 2 +- python/sparknlp/__init__.py | 4 +- scripts/colab_setup.sh | 2 +- scripts/kaggle_setup.sh | 2 +- scripts/sagemaker_setup.sh | 2 +- .../scala/com/johnsnowlabs/nlp/SparkNLP.scala | 2 +- .../com/johnsnowlabs/nlp/annotator.scala | 9 ++ .../scala/com/johnsnowlabs/util/Build.scala | 2 +- 19 files changed, 165 insertions(+), 139 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 39e902cfcf7b85..6f4a245d27b415 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,20 @@ +======== +5.0.2 +======== +---------------- +New Features & Enhancements +---------------- +* **NEW:** Introducing support for ONNX Runtime in ALBERT, CamemBERT, and XLM-RoBERTa annotators +* **NEW:** Implement ZeroShotNerModel annotator for zero-shot NER based on XLM-RoBERTa architecture + +---------------- +Bug Fixes +---------------- +* Fix MarianTransformer annotator breaking with `java.lang.ClassCastException` in Python +* Fix accuracy falling outside the 0.0-1.0 range in SentenceDetectorDL and MultiClassifierDL annotators +* Fix BART issue with low temperature value that only occurred when there are no non-infinite logits satisfying the low temperature and top_k values +* Add missing E5Embeddings and InstructorEmbeddings annotators to `annotators` in Scala for easy all-in-one import + ======== 5.0.1 ======== @@ -39,7 +56,7 @@ New Features & Enhancements ---------------- Bug Fixes ---------------- -* Fix not being able to save models from XXXForSequenceClassitication and XXXForZeroShotClassification annotoators https://github.com/JohnSnowLabs/spark-nlp/pull/13842 +*
Fix not being able to save models from XXXForSequenceClassification and XXXForZeroShotClassification annotators https://github.com/JohnSnowLabs/spark-nlp/pull/13842 ======== @@ -48,7 +65,7 @@ Bug Fixes ---------------- New Features & Enhancements ---------------- -* New `multilabel` parameter to swtich from multi-class to multi-label on all Classifiers in Spark NLP: AlbertForSequenceClassification, BertForSequenceClassification, DeBertaForSequenceClassification, DistilBertForSequenceClassification, LongformerForSequenceClassification, RoBertaForSequenceClassification, XlmRoBertaForSequenceClassification, XlnetForSequenceClassification, BertForZeroShotClassification, DistilBertForZeroShotClassification, and RobertaForZeroShotClassification +* New `multilabel` parameter to switch from multi-class to multi-label on all Classifiers in Spark NLP: AlbertForSequenceClassification, BertForSequenceClassification, DeBertaForSequenceClassification, DistilBertForSequenceClassification, LongformerForSequenceClassification, RoBertaForSequenceClassification, XlmRoBertaForSequenceClassification, XlnetForSequenceClassification, BertForZeroShotClassification, DistilBertForZeroShotClassification, and RobertaForZeroShotClassification * Refactor protected Params and Features to avoid unwanted exceptions during runtime https://github.com/JohnSnowLabs/spark-nlp/pull/13797 * Add proper documentation and instructions for ZeroShot classifiers: BertForZeroShotClassification, DistilBertForZeroShotClassification, and RobertaForZeroShotClassification https://github.com/JohnSnowLabs/spark-nlp/pull/13798 * Extend support for downloading models/pipelines directly by given name or S3 path in ResourceDownloader https://github.com/JohnSnowLabs/spark-nlp/pull/13796 @@ -58,7 +75,7 @@ Bug Fixes ---------------- * Fix pretrained pipelines that stopped working since 4.4.2 release on PySpark 3.0 and 3.1 versions (adding 123 new pipelines were added) https://github.com/JohnSnowLabs/spark-nlp/pull/13805 * 
Fix pretrained pipelines that stopped working since 4.4.2 release on PySpark 3.2 and 3.3 versions (adding 120 new pipelines) https://github.com/JohnSnowLabs/spark-nlp/pull/13811 -* Fix Java compatibility issue caused by SystemUtils dependecy https://github.com/JohnSnowLabs/spark-nlp/pull/13806 +* Fix Java compatibility issue caused by SystemUtils dependency https://github.com/JohnSnowLabs/spark-nlp/pull/13806 ======== @@ -157,7 +174,7 @@ New Features * Implement HubertForCTC annotator for automatic speech recognition * Implement SwinForImageClassification annotator for Image Classification * Introducing CamemBERT for Question Answering annotator -* Implement ZeroShotNerModel annotator for zero-shot NER baed on RoBERTa architecture +* Implement ZeroShotNerModel annotator for zero-shot NER based on RoBERTa architecture * Implement Date2Chunk annotator * Enable params argument in spark_nlp start() function * Allow doc_id reading CoNLL file datasets @@ -198,7 +215,7 @@ Bug Fixes & Enhancements * Fix missing to output embeddings in `.fullAnnotate()` method when `parseEmbeddings` param was set to `True/true` * Fix broken links to the Python API pages, as the generation of the PyDocs was slightly changed in a previous release. This makes the Python APIs accessible from the Annotators and Transformers pages like before * Change default values of `explodeEntities` and `mergeEntities` parameters to `true` -* Better error handling when there are empty paths/relations in `GraphExctraction`annotator. New message will better guide the user on how to configure `GraphExtraction` to output meaningful relationships +* Better error handling when there are empty paths/relations in `GraphExtraction` annotator.
New message will better guide the user on how to configure `GraphExtraction` to output meaningful relationships * Removed the duplicated definition of method `setWeightedDistPath` from `ContextSpellCheckerApproach` @@ -367,7 +384,7 @@ Bug Fixes ---------------- * Fix a bug in generating the NerDL graph by using TF v2. The previous graph generated by the `TFGraphBuilder` annotator resulted in an exception when the length of the sequence was 1. This issue has been resolved and the new graphs created by `TFGraphBuilder` won't have this issue anymore (https://github.com/JohnSnowLabs/spark-nlp/pull/12636) * Fix a bug introduced in the 4.0.0 release between Transformer-based Word Embeddings annotators. In the 4.0.0 release, the following annotators were migrated to BatchAnnotate to improve their performance, especially on GPU. However, a bug was introduced in sentence indices which when it is combined with SentenceEmbeddings for Text Classifications tasks (ClassifierDLApproach, SentimentDLApproach, and ClassifierDLApproach) resulted in low accuracy: AlbertEmbeddings, CamemBertEmbeddings, DeBertaEmbeddings, DistilBertEmbeddings, LongformerEmbeddings, RoBertaEmbeddings, XlmRoBertaEmbeddings, and XlnetEmbeddings (https://github.com/JohnSnowLabs/spark-nlp/pull/12641) -* Add support for a list of questions and context in LightPipline. Previously, only one context and question at a time were supported in LightPipeline for Question Answering annotators. We have added support to `fullAnnotate` and `annotate` to receive two lists of questions and contexts (https://github.com/JohnSnowLabs/spark-nlp/pull/12653) +* Add support for a list of questions and context in LightPipeline. Previously, only one context and question at a time were supported in LightPipeline for Question Answering annotators. 
We have added support to `fullAnnotate` and `annotate` to receive two lists of questions and contexts (https://github.com/JohnSnowLabs/spark-nlp/pull/12653) * Fix division by zero exception in the `GPT2Transformer` annotator when the `setDoSample` param was set to true (https://github.com/JohnSnowLabs/spark-nlp/pull/12661) ======== @@ -437,7 +454,7 @@ New Features & Enhancements * Migrate T5Transformer to TensorFlow v2 architecture with re-uploading all the existing models * Official support for Apple silicon M1 on macOS devices. From Spark NLP 4.0.0 you can use `spark-nlp-m1` package that supports Apple silicon M1 on your macOS machine * Official support for Apache Spark and PySpark 3.2.x on Scala 2.12. Spark NLP by default is shipped for Spark 3.2.x and supports Spark/PySpark 3.0.x and 3.1.x in additions -* Unifying all supported Apache Spark pacakges on Maven into `spark-nlp` for CPU, `spark-nlp-gpu` for GPU, and `spark-nlp-m1` for new Apple silicon M1 on macOS. The need for Apache Spark specific package like `spark-nlp-spark32` has been removed. +* Unifying all supported Apache Spark packages on Maven into `spark-nlp` for CPU, `spark-nlp-gpu` for GPU, and `spark-nlp-m1` for new Apple silicon M1 on macOS. The need for Apache Spark specific package like `spark-nlp-spark32` has been removed. * Adding a new param to sparknlp.start() function in Python and Scala for Apple silicon M1 on macOS (`m1=True`) * Update Colab, Kaggle, and SageMaker scripts * Add new default NerDL graph for xsmall DeBERTa embeddings model (384 dimensions) @@ -467,7 +484,7 @@ Bug Fixes ---------------- * Fix the default pre-trained model for DeBertaForTokenClassification in Scala and Python * Remove a requirement in DocumentNormalizer that consecutive stage processing can produce empty text annotations without breaking the pipeline -* Fix WordSegmenterModel outputing wrong order of tokens. 
The regex that groups the tagging format was refactored to preserve the order of segmented outputs (tokens) +* Fix WordSegmenterModel outputting wrong order of tokens. The regex that groups the tagging format was refactored to preserve the order of segmented outputs (tokens) * Fix encoding sentences not respecting the max sequence length given by a user in XlmRobertaSentenceEmbeddings * Fix encoding sentences by using SentencePiece to calculate the correct tokens indexing * Fix SentencePiece serialization issue when XlmRoBertaEmbeddings and XlmRoBertaSentenceEmbeddings annotators are used from a Fat JAR on GPU diff --git a/README.md b/README.md index 57c4ce0793a35a..559c7344328303 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.0.1 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.2 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. 
The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -183,7 +183,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.1 pyspark==3.3.1 +$ pip install spark-nlp==5.0.2 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -228,7 +228,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.0.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.0.2* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -267,7 +267,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.0.1 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.2 has been tested and is compatible with the following runtimes: **CPU:** @@ -325,7 +325,7 @@ Spark NLP 5.0.1 has been tested and is compatible with the following runtimes: ## EMR Support -Spark NLP 5.0.1 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.2 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -369,11 +369,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 
-pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` The `spark-nlp` has been published to @@ -382,11 +382,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.2 ``` @@ -396,11 +396,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.2 ``` @@ -410,11 +410,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.2 ``` @@ -428,7 +428,7 @@ set in 
your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` ## Scala @@ -446,7 +446,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.0.1 + 5.0.2 ``` @@ -457,7 +457,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.0.1 + 5.0.2 ``` @@ -468,7 +468,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.0.1 + 5.0.2 ``` @@ -479,7 +479,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.1 + 5.0.2 ``` @@ -489,28 +489,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.2" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.2" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.2" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.2" ``` Maven @@ -532,7 +532,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==5.0.1 +pip install spark-nlp==5.0.2 ``` Conda: @@ -561,7 +561,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") 
.config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2") .getOrCreate() ``` @@ -632,7 +632,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -643,7 +643,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.0.1 +pip install spark-nlp==5.0.2 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -671,7 +671,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.1 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.2 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -688,7 +688,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -715,7 +715,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.2 ``` [Spark NLP quick start on Google 
Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -738,7 +738,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.2 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -757,9 +757,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.0.1` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.0.2` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
@@ -810,7 +810,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2" } }] ``` @@ -819,7 +819,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.0.1" \ +--name "Spark NLP 5.0.2" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -883,7 +883,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -922,7 +922,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2") .getOrCreate() ``` @@ -936,7 +936,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` **pyspark:** @@ -949,7 +949,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` **Databricks:** @@ -1221,7 +1221,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.1.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.2.jar") .getOrCreate() ``` @@ -1230,7 +1230,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.1.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.2.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/build.sbt b/build.sbt index 2fdac1c421cc55..f6a58d47bb443e 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "5.0.1" +version := "5.0.2" (ThisBuild / scalaVersion) := scalaVer diff --git a/docs/_layouts/landing.html b/docs/_layouts/landing.html index 654d6642bf3c8d..4df163fe53e938 100755 --- a/docs/_layouts/landing.html +++ b/docs/_layouts/landing.html @@ -201,7 +201,7 @@

{{ _section.title }}

{% highlight bash %} # Using PyPI - $ pip install spark-nlp==5.0.1 + $ pip install spark-nlp==5.0.2 # Using Anaconda/Conda $ conda install -c johnsnowlabs spark-nlp diff --git a/docs/en/concepts.md b/docs/en/concepts.md index 07d5c17121d3a7..f3bbe3e596cccd 100644 --- a/docs/en/concepts.md +++ b/docs/en/concepts.md @@ -62,7 +62,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.1 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.2 pyspark==3.3.1 jupyter $ jupyter notebook ``` diff --git a/docs/en/examples.md b/docs/en/examples.md index bf097b741e9ff1..8cecd6297588f5 100644 --- a/docs/en/examples.md +++ b/docs/en/examples.md @@ -16,7 +16,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp -$ pip install spark-nlp==5.0.1 pyspark==3.3.1 +$ pip install spark-nlp==5.0.2 pyspark==3.3.1 ``` ## Google Colab Notebook @@ -36,7 +36,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!bash colab.sh -p 3.2.3 -s 5.0.1 +!bash colab.sh -p 3.2.3 -s 5.0.2 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. 
diff --git a/docs/en/hardware_acceleration.md b/docs/en/hardware_acceleration.md index 1bc6ada008fdb8..08e67299d61ee0 100644 --- a/docs/en/hardware_acceleration.md +++ b/docs/en/hardware_acceleration.md @@ -49,7 +49,7 @@ Since the new Transformer models such as BERT for Word and Sentence embeddings a | DeBERTa Large | +477%(5.8x) | | Longformer Base | +52%(1.5x) | -Spark NLP 5.0.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.2 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/docs/en/install.md b/docs/en/install.md index 78445a811a67a8..1bf0ff1d78a3f5 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -15,22 +15,22 @@ sidebar: ```bash # Install Spark NLP from PyPI -pip install spark-nlp==5.0.1 +pip install spark-nlp==5.0.2 # Install Spark NLP from Anacodna/Conda conda install -c johnsnowlabs spark-nlp # Load Spark NLP with Spark Shell -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 # Load Spark NLP with PySpark -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 # Load Spark NLP with Spark Submit -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 # Load Spark NLP as external JAR after compiling and building Spark NLP by `sbt assembly` -spark-shell --jars spark-nlp-assembly-5.0.1.jar +spark-shell --jars spark-nlp-assembly-5.0.2.jar ``` ## Python @@ -49,7 +49,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp -$ pip install spark-nlp==5.0.1 pyspark==3.3.1 +$ pip install spark-nlp==5.0.2 pyspark==3.3.1 ``` Of course you will need to have jupyter 
installed in your system: @@ -76,7 +76,7 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1")\ + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2")\ .getOrCreate() ``` @@ -91,7 +91,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp_2.12 - 5.0.1 + 5.0.2 ``` @@ -102,7 +102,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.0.1 + 5.0.2 ``` @@ -113,7 +113,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.1 + 5.0.2 ``` @@ -124,7 +124,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.0.1 + 5.0.2 ``` @@ -134,28 +134,28 @@ spark = SparkSession.builder \ ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.2" ``` **spark-nlp-gpu:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.2" ``` **spark-nlp-silicon:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.2" ``` **spark-nlp-aarch64:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.2" ``` Maven Central: 
[https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) @@ -233,7 +233,7 @@ maven coordinates like these: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.1 + 5.0.2 ``` @@ -241,7 +241,7 @@ or in case of sbt: ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.2" ``` If everything went well, you can now start Spark NLP with the `m1` flag set to `true`: @@ -274,7 +274,7 @@ spark = sparknlp.start(apple_silicon=True) ## Installation for Linux Aarch64 Systems -Starting from version 5.0.1, Spark NLP supports Linux systems running on an aarch64 +Starting from version 5.0.2, Spark NLP supports Linux systems running on an aarch64 processor architecture. The necessary dependencies have been built on Ubuntu 16.04, so a recent system with an environment of at least that will be needed. @@ -318,7 +318,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 +!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.2 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. @@ -337,7 +337,7 @@ Run the following code in Kaggle Kernel and start using spark-nlp right away. 
## Databricks Support -Spark NLP 5.0.1 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.2 has been tested and is compatible with the following runtimes: **CPU:** @@ -403,7 +403,7 @@ NOTE: Spark NLP 4.0.x is based on TensorFlow 2.7.x which is compatible with CUDA 3.1. Install New -> PyPI -> `spark-nlp` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -419,7 +419,7 @@ Note: You can import these notebooks by using their URLs. ## EMR Support -Spark NLP 5.0.1 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.2 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -477,7 +477,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2" } } ] @@ -487,7 +487,7 @@ A sample of AWS CLI to launch EMR cluster: ```sh aws emr create-cluster \ ---name "Spark NLP 5.0.1" \ +--name "Spark NLP 5.0.2" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -741,7 +741,7 @@ We recommend using `conda` to manage your Python environment on Windows. Now you can use the downloaded binary by navigating to `%SPARK_HOME%\bin` and running -Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1*. 
+Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2*. @@ -767,12 +767,12 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.1.jar")\ + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.2.jar")\ .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark version (3.x) -- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.1.jar`) +- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.2.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/docs/en/spark_nlp.md b/docs/en/spark_nlp.md index a5791610509f45..320195e6517931 100644 --- a/docs/en/spark_nlp.md +++ b/docs/en/spark_nlp.md @@ -25,7 +25,7 @@ Spark NLP is built on top of **Apache Spark 3.x**. 
For using Spark NLP you need: **GPU (optional):** -Spark NLP 5.0.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.2 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/python/README.md b/python/README.md index 57c4ce0793a35a..559c7344328303 100644 --- a/python/README.md +++ b/python/README.md @@ -167,7 +167,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.0.1 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: +Spark NLP 5.0.2 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -183,7 +183,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.1 pyspark==3.3.1 +$ pip install spark-nlp==5.0.2 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -228,7 +228,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.0.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.0.2* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ 
-267,7 +267,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.0.1 has been tested and is compatible with the following runtimes: +Spark NLP 5.0.2 has been tested and is compatible with the following runtimes: **CPU:** @@ -325,7 +325,7 @@ Spark NLP 5.0.1 has been tested and is compatible with the following runtimes: ## EMR Support -Spark NLP 5.0.1 has been tested and is compatible with the following EMR releases: +Spark NLP 5.0.2 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -369,11 +369,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` The `spark-nlp` has been published to @@ -382,11 +382,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.0.2 ``` @@ -396,11 +396,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.2 -pyspark --packages 
com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.0.2 ``` @@ -410,11 +410,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.2 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.2 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.1 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.0.2 ``` @@ -428,7 +428,7 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` ## Scala @@ -446,7 +446,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.0.1 + 5.0.2 ``` @@ -457,7 +457,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.0.1 + 5.0.2 ``` @@ -468,7 +468,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.0.1 + 5.0.2 ``` @@ -479,7 +479,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.0.1 + 5.0.2 ``` @@ -489,28 +489,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.0.2" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.0.2" 
``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.0.2" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.1" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.0.2" ``` Maven @@ -532,7 +532,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==5.0.1 +pip install spark-nlp==5.0.2 ``` Conda: @@ -561,7 +561,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2") .getOrCreate() ``` @@ -632,7 +632,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -643,7 +643,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.0.1 +pip install spark-nlp==5.0.2 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -671,7 +671,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.0.1 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.0.2 
pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -688,7 +688,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -715,7 +715,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.2 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -738,7 +738,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.1 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.0.2 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -757,9 +757,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.0.1` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.0.2` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1` -> Install + 3.2. 
Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -810,7 +810,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2" } }] ``` @@ -819,7 +819,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.0.1" \ +--name "Spark NLP 5.0.2" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -883,7 +883,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -922,7 +922,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2") .getOrCreate() ``` @@ -936,7 +936,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` **pyspark:** @@ -949,7 +949,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.1 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.0.2 ``` **Databricks:** @@ -1221,7 +1221,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.1.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.0.2.jar") .getOrCreate() ``` @@ -1230,7 +1230,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.1.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.0.2.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/python/docs/conf.py b/python/docs/conf.py index 0466935bd8fd5c..8ccc48e7d920ea 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,7 +23,7 @@ author = "John Snow Labs" # The full version, including alpha/beta/rc tags -release = "5.0.1" +release = "5.0.2" pyspark_version = "3.2.3" # -- General configuration --------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index b9a652fc533069..f33027a4cd75c2 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='5.0.1', # Required + version='5.0.2', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index f3a6423ba28940..afb1410fecb89a 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -128,7 +128,7 @@ def start(gpu=False, The initiated Spark session. """ - current_version = "5.0.1" + current_version = "5.0.2" if params is None: params = {} @@ -309,4 +309,4 @@ def version(): str The current Spark NLP version. 
""" - return '5.0.1' + return '5.0.2' diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index b8b4b3f91c983d..bc8b28a0cd0637 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.0.1" +SPARKNLP="5.0.2" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index 3b198d009c7250..4c1266ce2a5cee 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.0.1" +SPARKNLP="5.0.2" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/sagemaker_setup.sh b/scripts/sagemaker_setup.sh index fb7318a212ac08..fcb2fb59dc7452 100644 --- a/scripts/sagemaker_setup.sh +++ b/scripts/sagemaker_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # Default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.0.1" +SPARKNLP="5.0.2" PYSPARK="3.2.3" echo "Setup SageMaker for PySpark $PYSPARK and Spark NLP $SPARKNLP" diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index 5746ae9d1e07fc..e5c2f86cc6d95d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.SparkSession object SparkNLP { - val currentVersion = "5.0.1" + val currentVersion = "5.0.2" val MavenSpark3 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" val MavenGpuSpark3 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" val MavenSparkSilicon = s"com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:$currentVersion" diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index fdf5eb0ea2929d..c3ce62c50d7766 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ 
b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -712,4 +712,13 @@ package object annotator { object XlmRoBertaForZeroShotClassification extends ReadablePretrainedXlmRoBertaForZeroShotModel with ReadXlmRoBertaForZeroShotDLModel + + type E5Embeddings = com.johnsnowlabs.nlp.embeddings.E5Embeddings + + object E5Embeddings extends ReadablePretrainedE5Model with ReadE5DLModel + + type InstructorEmbeddings = com.johnsnowlabs.nlp.embeddings.InstructorEmbeddings + + object InstructorEmbeddings extends ReadablePretrainedInstructorModel with ReadInstructorDLModel + } diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala index 38c1c2fae724a1..e055573d2de203 100644 --- a/src/main/scala/com/johnsnowlabs/util/Build.scala +++ b/src/main/scala/com/johnsnowlabs/util/Build.scala @@ -17,5 +17,5 @@ package com.johnsnowlabs.util object Build { - val version: String = "5.0.1" + val version: String = "5.0.2" } From 66410c26f393e33e027b0ea7a6032b13ca8adecc Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 2 Aug 2023 14:45:57 +0000 Subject: [PATCH 7/8] Update Scala and Python APIs --- docs/api/com/index.html | 8 +- .../johnsnowlabs/client/CredentialParams.html | 8 +- .../client/aws/AWSAnonymousCredentials.html | 8 +- .../client/aws/AWSBasicCredentials.html | 8 +- .../client/aws/AWSCredentialsProvider.html | 8 +- .../johnsnowlabs/client/aws/AWSGateway.html | 8 +- .../client/aws/AWSProfileCredentials.html | 8 +- .../client/aws/AWSTokenCredentials.html | 8 +- .../johnsnowlabs/client/aws/Credentials.html | 8 +- .../com/johnsnowlabs/client/aws/index.html | 8 +- .../johnsnowlabs/client/gcp/GCPGateway.html | 8 +- .../com/johnsnowlabs/client/gcp/index.html | 8 +- docs/api/com/johnsnowlabs/client/index.html | 8 +- .../johnsnowlabs/collections/SearchTrie$.html | 8 +- .../johnsnowlabs/collections/SearchTrie.html | 8 +- .../collections/StorageSearchTrie$.html | 8 +- .../collections/StorageSearchTrie.html | 8 +- 
.../com/johnsnowlabs/collections/index.html | 8 +- docs/api/com/johnsnowlabs/index.html | 8 +- docs/api/com/johnsnowlabs/ml/ai/DeBerta.html | 8 +- .../ml/ai/MergeTokenStrategy$.html | 8 +- docs/api/com/johnsnowlabs/ml/ai/index.html | 8 +- .../ml/ai/util/Generation/Generate.html | 8 +- .../ml/ai/util/Generation/Logit/Logit.html | 8 +- .../Logit/LogitProcess/LogitProcessor.html | 8 +- .../LogitProcess/MinLengthLogitProcessor.html | 8 +- .../NoRepeatNgramsLogitProcessor.html | 8 +- .../RepetitionPenaltyLogitProcessor.html | 8 +- .../Generation/Logit/LogitProcess/index.html | 8 +- .../Generation/Logit/LogitProcessorList.html | 8 +- .../Logit/LogitWarper/LogitWarper.html | 8 +- .../LogitWarper/TemperatureLogitWarper.html | 8 +- .../Logit/LogitWarper/TopKLogitWarper.html | 8 +- .../Logit/LogitWarper/TopPLogitWarper.html | 8 +- .../Generation/Logit/LogitWarper/index.html | 8 +- .../ml/ai/util/Generation/Logit/index.html | 8 +- .../Generation/Search/BeamHypotheses.html | 8 +- .../ai/util/Generation/Search/BeamScorer.html | 8 +- .../Generation/Search/BeamSearchScorer.html | 8 +- .../ml/ai/util/Generation/Search/index.html | 8 +- .../ml/ai/util/Generation/index.html | 8 +- .../com/johnsnowlabs/ml/ai/util/index.html | 8 +- docs/api/com/johnsnowlabs/ml/crf/Attr.html | 8 +- .../com/johnsnowlabs/ml/crf/AttrFeature.html | 8 +- .../api/com/johnsnowlabs/ml/crf/AttrStat.html | 8 +- .../com/johnsnowlabs/ml/crf/CrfDataset.html | 8 +- .../com/johnsnowlabs/ml/crf/CrfParams.html | 8 +- .../johnsnowlabs/ml/crf/DatasetEncoder.html | 8 +- .../johnsnowlabs/ml/crf/DatasetMetadata.html | 8 +- .../johnsnowlabs/ml/crf/DatasetReader$.html | 8 +- .../johnsnowlabs/ml/crf/EdgeCalculator$.html | 8 +- .../com/johnsnowlabs/ml/crf/FbCalculator.html | 8 +- .../api/com/johnsnowlabs/ml/crf/Instance.html | 8 +- .../johnsnowlabs/ml/crf/InstanceLabels.html | 8 +- .../johnsnowlabs/ml/crf/L2DecayStrategy.html | 8 +- .../johnsnowlabs/ml/crf/LinearChainCrf.html | 8 +- .../ml/crf/LinearChainCrfModel.html | 8 +- 
.../ml/crf/SerializedDatasetMetadata.html | 8 +- .../ml/crf/SerializedLinearChainCrfModel.html | 8 +- .../ml/crf/SparseArray$$SeqWrapper.html | 8 +- .../com/johnsnowlabs/ml/crf/SparseArray$.html | 8 +- .../com/johnsnowlabs/ml/crf/SparseArray.html | 8 +- .../ml/crf/TextSentenceAttrs.html | 8 +- .../ml/crf/TextSentenceLabels.html | 8 +- .../com/johnsnowlabs/ml/crf/Transition.html | 8 +- .../com/johnsnowlabs/ml/crf/VectorMath$.html | 8 +- .../com/johnsnowlabs/ml/crf/WordAttrs.html | 8 +- docs/api/com/johnsnowlabs/ml/crf/index.html | 8 +- docs/api/com/johnsnowlabs/ml/index.html | 8 +- .../johnsnowlabs/ml/onnx/OnnxWrapper$.html | 8 +- .../com/johnsnowlabs/ml/onnx/OnnxWrapper.html | 8 +- .../johnsnowlabs/ml/onnx/ReadOnnxModel.html | 10 +- .../johnsnowlabs/ml/onnx/WriteOnnxModel.html | 10 +- docs/api/com/johnsnowlabs/ml/onnx/index.html | 8 +- .../tensorflow/ClassifierDatasetEncoder.html | 8 +- .../ClassifierDatasetEncoderParams.html | 8 +- .../ml/tensorflow/DatasetEncoderParams.html | 8 +- .../johnsnowlabs/ml/tensorflow/Logging.html | 8 +- .../ml/tensorflow/ModelSignature.html | 8 +- .../johnsnowlabs/ml/tensorflow/NerBatch$.html | 8 +- .../johnsnowlabs/ml/tensorflow/NerBatch.html | 8 +- .../ml/tensorflow/NerDatasetEncoder.html | 8 +- .../ml/tensorflow/ReadTensorflowModel.html | 10 +- .../ml/tensorflow/SentenceGrouper.html | 8 +- .../ml/tensorflow/TensorResources$.html | 8 +- .../ml/tensorflow/TensorResources.html | 8 +- .../ml/tensorflow/TensorflowClassifier.html | 8 +- .../ml/tensorflow/TensorflowWrapper$.html | 8 +- .../ml/tensorflow/TensorflowWrapper.html | 8 +- .../johnsnowlabs/ml/tensorflow/Variables.html | 8 +- .../ml/tensorflow/WriteTensorflowModel.html | 10 +- .../com/johnsnowlabs/ml/tensorflow/index.html | 8 +- .../sentencepiece/ReadSentencePieceModel.html | 10 +- .../sentencepiece/SentencePieceException.html | 8 +- .../sentencepiece/SentencePieceProcessor.html | 8 +- .../sentencepiece/SentencePieceWrapper$.html | 8 +- .../WriteSentencePieceModel.html | 10 +- 
.../ml/tensorflow/sentencepiece/index.html | 8 +- ...delSignatureConstants$$AttentionMask$.html | 8 +- ...lSignatureConstants$$AttentionMaskV1$.html | 8 +- ...SignatureConstants$$AudioValuesInput$.html | 8 +- ...s$$CachedDecoderEncoderAttentionMask$.html | 8 +- ...stants$$CachedDecoderEncoderInputIds$.html | 8 +- ...eConstants$$CachedDecoderInputCache1$.html | 8 +- ...eConstants$$CachedDecoderInputCache2$.html | 8 +- ...tureConstants$$CachedDecoderInputIds$.html | 8 +- ...natureConstants$$CachedEncoderOutput$.html | 8 +- ...gnatureConstants$$CachedLogitsOutput$.html | 8 +- ...delSignatureConstants$$CachedOutPut2$.html | 8 +- ...delSignatureConstants$$CachedOutput1$.html | 8 +- .../sign/ModelSignatureConstants$$DType$.html | 8 +- ...atureConstants$$DecoderAttentionMask$.html | 8 +- ...nstants$$DecoderEncoderAttentionMask$.html | 8 +- ...ureConstants$$DecoderEncoderInputIds$.html | 8 +- ...lSignatureConstants$$DecoderInputIds$.html | 8 +- ...delSignatureConstants$$DecoderOutput$.html | 8 +- .../ModelSignatureConstants$$DimCount$.html | 8 +- ...atureConstants$$EncoderAttentionMask$.html | 8 +- ...gnatureConstants$$EncoderContextMask$.html | 8 +- ...lSignatureConstants$$EncoderInputIds$.html | 8 +- ...delSignatureConstants$$EncoderOutput$.html | 8 +- ...lSignatureConstants$$EndLogitsOutput$.html | 8 +- ...ignatureConstants$$InitCachedOutPut2$.html | 8 +- ...ignatureConstants$$InitCachedOutput1$.html | 8 +- ...nts$$InitDecoderEncoderAttentionMask$.html | 8 +- ...onstants$$InitDecoderEncoderInputIds$.html | 8 +- ...natureConstants$$InitDecoderInputIds$.html | 8 +- ...SignatureConstants$$InitLogitsOutput$.html | 8 +- .../ModelSignatureConstants$$InputIds$.html | 8 +- .../ModelSignatureConstants$$InputIdsV1$.html | 8 +- ...lSignatureConstants$$LastHiddenState$.html | 8 +- ...ignatureConstants$$LastHiddenStateV1$.html | 8 +- ...odelSignatureConstants$$LogitsOutput$.html | 8 +- .../sign/ModelSignatureConstants$$Name$.html | 8 +- ...SignatureConstants$$PixelValuesInput$.html 
| 8 +- ...odelSignatureConstants$$PoolerOutput$.html | 8 +- ...elSignatureConstants$$PoolerOutputV1$.html | 8 +- ...elSignatureConstants$$SerializedSize$.html | 8 +- ...odelSignatureConstants$$ShapeDimList$.html | 8 +- ...ignatureConstants$$StartLogitsOutput$.html | 8 +- ...lSignatureConstants$$TFInfoDescriptor.html | 8 +- ...lSignatureConstants$$TFInfoNameMapper.html | 8 +- ...stants$$TapasLogitsAggregationOutput$.html | 8 +- ...ignatureConstants$$TapasLogitsOutput$.html | 8 +- ...odelSignatureConstants$$TokenTypeIds$.html | 8 +- ...elSignatureConstants$$TokenTypeIdsV1$.html | 8 +- .../sign/ModelSignatureConstants$.html | 8 +- .../sign/ModelSignatureManager$.html | 8 +- .../ml/tensorflow/sign/index.html | 8 +- .../ml/util/LoadExternalModel$.html | 8 +- .../com/johnsnowlabs/ml/util/ModelArch$.html | 8 +- .../com/johnsnowlabs/ml/util/ModelEngine.html | 8 +- docs/api/com/johnsnowlabs/ml/util/ONNX$.html | 8 +- .../com/johnsnowlabs/ml/util/PyTorch$.html | 8 +- .../com/johnsnowlabs/ml/util/TensorFlow$.html | 8 +- .../com/johnsnowlabs/ml/util/Unknown$.html | 8 +- docs/api/com/johnsnowlabs/ml/util/index.html | 8 +- .../johnsnowlabs/nlp/ActivationFunction$.html | 8 +- .../nlp/Annotation$$AnnotationContainer.html | 8 +- ...nnotation$$extractors$$AnnotationData.html | 8 +- .../nlp/Annotation$$extractors$.html | 8 +- .../api/com/johnsnowlabs/nlp/Annotation$.html | 8 +- docs/api/com/johnsnowlabs/nlp/Annotation.html | 8 +- .../AnnotationAudio$$AnnotationContainer.html | 8 +- .../nlp/AnnotationAudio$$AudioFields.html | 8 +- .../johnsnowlabs/nlp/AnnotationAudio$.html | 8 +- .../com/johnsnowlabs/nlp/AnnotationAudio.html | 8 +- .../AnnotationImage$$AnnotationContainer.html | 8 +- .../nlp/AnnotationImage$$ImageFields.html | 8 +- .../johnsnowlabs/nlp/AnnotationImage$.html | 8 +- .../com/johnsnowlabs/nlp/AnnotationImage.html | 8 +- .../johnsnowlabs/nlp/AnnotatorApproach.html | 8 +- .../com/johnsnowlabs/nlp/AnnotatorModel.html | 10 +- .../com/johnsnowlabs/nlp/AnnotatorType$.html | 8 +- 
.../com/johnsnowlabs/nlp/AudioAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/AudioAssembler.html | 8 +- docs/api/com/johnsnowlabs/nlp/CanBeLazy.html | 10 +- docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html | 8 +- docs/api/com/johnsnowlabs/nlp/Doc2Chunk.html | 8 +- .../johnsnowlabs/nlp/DocumentAssembler$.html | 8 +- .../johnsnowlabs/nlp/DocumentAssembler.html | 8 +- .../johnsnowlabs/nlp/EmbeddingsFinisher$.html | 8 +- .../johnsnowlabs/nlp/EmbeddingsFinisher.html | 8 +- .../com/johnsnowlabs/nlp/FeaturesReader.html | 8 +- .../com/johnsnowlabs/nlp/FeaturesWriter.html | 8 +- docs/api/com/johnsnowlabs/nlp/Finisher$.html | 8 +- docs/api/com/johnsnowlabs/nlp/Finisher.html | 8 +- .../com/johnsnowlabs/nlp/GraphFinisher.html | 8 +- .../nlp/HasAudioFeatureProperties.html | 8 +- .../johnsnowlabs/nlp/HasBatchedAnnotate.html | 10 +- .../nlp/HasBatchedAnnotateAudio.html | 8 +- .../nlp/HasBatchedAnnotateImage.html | 8 +- .../nlp/HasCandidateLabelsProperties.html | 10 +- .../nlp/HasCaseSensitiveProperties.html | 10 +- .../HasClassifierActivationProperties.html | 10 +- .../nlp/HasEnableCachingProperties.html | 8 +- docs/api/com/johnsnowlabs/nlp/HasEngine.html | 10 +- .../api/com/johnsnowlabs/nlp/HasFeatures.html | 10 +- .../nlp/HasImageFeatureProperties.html | 8 +- .../nlp/HasInputAnnotationCols.html | 10 +- .../nlp/HasMultipleInputAnnotationCols.html | 8 +- .../nlp/HasOutputAnnotationCol.html | 10 +- .../nlp/HasOutputAnnotatorType.html | 10 +- .../com/johnsnowlabs/nlp/HasPretrained.html | 10 +- .../HasProtectedParams$ProtectedParam.html | 8 +- .../johnsnowlabs/nlp/HasProtectedParams.html | 8 +- .../com/johnsnowlabs/nlp/HasRecursiveFit.html | 8 +- .../nlp/HasRecursiveTransform.html | 8 +- .../johnsnowlabs/nlp/HasSimpleAnnotate.html | 8 +- .../api/com/johnsnowlabs/nlp/IAnnotation.html | 8 +- .../com/johnsnowlabs/nlp/ImageAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/ImageAssembler.html | 8 +- .../com/johnsnowlabs/nlp/JavaAnnotation.html | 8 +- 
.../com/johnsnowlabs/nlp/LightPipeline.html | 8 +- .../nlp/MultiDocumentAssembler$.html | 8 +- .../nlp/MultiDocumentAssembler.html | 8 +- .../nlp/ParamsAndFeaturesReadable.html | 10 +- .../nlp/ParamsAndFeaturesWritable.html | 10 +- .../com/johnsnowlabs/nlp/RawAnnotator.html | 10 +- .../johnsnowlabs/nlp/RecursivePipeline.html | 8 +- .../nlp/RecursivePipelineModel.html | 8 +- docs/api/com/johnsnowlabs/nlp/SparkNLP$.html | 8 +- .../com/johnsnowlabs/nlp/TableAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/TableAssembler.html | 8 +- .../com/johnsnowlabs/nlp/TokenAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/TokenAssembler.html | 8 +- .../nlp/annotators/Chunk2Doc$.html | 8 +- .../nlp/annotators/Chunk2Doc.html | 8 +- .../nlp/annotators/ChunkTokenizer$.html | 8 +- .../nlp/annotators/ChunkTokenizer.html | 8 +- .../nlp/annotators/ChunkTokenizerModel$.html | 8 +- .../nlp/annotators/ChunkTokenizerModel.html | 8 +- .../johnsnowlabs/nlp/annotators/Chunker$.html | 8 +- .../johnsnowlabs/nlp/annotators/Chunker.html | 8 +- .../nlp/annotators/Date2Chunk$.html | 8 +- .../nlp/annotators/Date2Chunk.html | 8 +- .../nlp/annotators/DateMatcher$.html | 8 +- .../nlp/annotators/DateMatcher.html | 8 +- .../nlp/annotators/DateMatcherTranslator.html | 8 +- .../DateMatcherTranslatorPolicy.html | 8 +- .../nlp/annotators/DateMatcherUtils.html | 8 +- .../nlp/annotators/DocumentNormalizer$.html | 8 +- .../nlp/annotators/DocumentNormalizer.html | 8 +- .../nlp/annotators/EnglishStemmer$.html | 8 +- .../nlp/annotators/GraphExtraction.html | 8 +- .../nlp/annotators/Lemmatizer$.html | 8 +- .../nlp/annotators/Lemmatizer.html | 8 +- .../nlp/annotators/LemmatizerModel$.html | 8 +- .../nlp/annotators/LemmatizerModel.html | 8 +- .../nlp/annotators/LookAroundManager$.html | 8 +- .../nlp/annotators/MultiDateMatcher$.html | 8 +- .../nlp/annotators/MultiDateMatcher.html | 8 +- .../nlp/annotators/MultiDatePolicy$.html | 8 +- .../nlp/annotators/NGramGenerator$.html | 8 +- .../nlp/annotators/NGramGenerator.html | 8 
+- .../nlp/annotators/Normalizer$.html | 8 +- .../nlp/annotators/Normalizer.html | 8 +- .../nlp/annotators/NormalizerModel$.html | 8 +- ...alizerModel$TokenizerAndNormalizerMap.html | 8 +- .../nlp/annotators/NormalizerModel.html | 8 +- .../annotators/PretrainedAnnotations$.html | 8 +- .../ReadablePretrainedLemmatizer.html | 8 +- ...adablePretrainedStopWordsCleanerModel.html | 8 +- .../ReadablePretrainedTextMatcher.html | 8 +- .../ReadablePretrainedTokenizer.html | 8 +- .../nlp/annotators/RecursiveTokenizer.html | 8 +- .../annotators/RecursiveTokenizerModel$.html | 8 +- .../annotators/RecursiveTokenizerModel.html | 8 +- .../nlp/annotators/RegexMatcher$.html | 8 +- .../nlp/annotators/RegexMatcher.html | 8 +- .../nlp/annotators/RegexMatcherModel$.html | 8 +- .../nlp/annotators/RegexMatcherModel.html | 8 +- .../nlp/annotators/RegexTokenizer$.html | 8 +- .../nlp/annotators/RegexTokenizer.html | 8 +- .../nlp/annotators/SingleDatePolicy$.html | 8 +- .../johnsnowlabs/nlp/annotators/Stemmer$.html | 8 +- .../johnsnowlabs/nlp/annotators/Stemmer.html | 8 +- .../nlp/annotators/StopWordsCleaner$.html | 8 +- .../nlp/annotators/StopWordsCleaner.html | 8 +- .../nlp/annotators/TextMatcher$.html | 8 +- .../nlp/annotators/TextMatcher.html | 8 +- .../nlp/annotators/TextMatcherModel$.html | 8 +- .../nlp/annotators/TextMatcherModel.html | 8 +- .../nlp/annotators/Token2Chunk$.html | 8 +- .../nlp/annotators/Token2Chunk.html | 8 +- .../nlp/annotators/Tokenizer$.html | 8 +- .../nlp/annotators/Tokenizer.html | 8 +- .../nlp/annotators/TokenizerModel$.html | 8 +- .../nlp/annotators/TokenizerModel.html | 8 +- .../nlp/annotators/audio/HubertForCTC$.html | 8 +- .../nlp/annotators/audio/HubertForCTC.html | 8 +- .../audio/ReadHubertForAudioDLModel.html | 8 +- .../audio/ReadWav2Vec2ForAudioDLModel.html | 8 +- ...ReadablePretrainedHubertForAudioModel.html | 8 +- ...adablePretrainedWav2Vec2ForAudioModel.html | 8 +- .../nlp/annotators/audio/Wav2Vec2ForCTC$.html | 8 +- 
.../nlp/annotators/audio/Wav2Vec2ForCTC.html | 8 +- .../nlp/annotators/audio/index.html | 8 +- .../nlp/annotators/btm/BigTextMatcher$.html | 8 +- .../nlp/annotators/btm/BigTextMatcher.html | 8 +- .../annotators/btm/BigTextMatcherModel$.html | 8 +- .../annotators/btm/BigTextMatcherModel.html | 8 +- .../btm/ReadablePretrainedBigTextMatcher.html | 8 +- .../nlp/annotators/btm/TMEdgesReadWriter.html | 8 +- .../nlp/annotators/btm/TMEdgesReader.html | 8 +- .../nlp/annotators/btm/TMNodesReader.html | 8 +- .../nlp/annotators/btm/TMNodesWriter.html | 8 +- .../nlp/annotators/btm/TMVocabReadWriter.html | 8 +- .../nlp/annotators/btm/TMVocabReader.html | 8 +- .../nlp/annotators/btm/TrieNode.html | 8 +- .../nlp/annotators/btm/index.html | 8 +- .../dl/AlbertForQuestionAnswering$.html | 20 +- .../dl/AlbertForQuestionAnswering.html | 20 +- .../dl/AlbertForSequenceClassification$.html | 20 +- .../dl/AlbertForSequenceClassification.html | 20 +- .../dl/AlbertForTokenClassification$.html | 20 +- .../dl/AlbertForTokenClassification.html | 20 +- .../dl/BertForQuestionAnswering$.html | 20 +- .../dl/BertForQuestionAnswering.html | 20 +- .../dl/BertForSequenceClassification$.html | 20 +- .../dl/BertForSequenceClassification.html | 20 +- .../dl/BertForTokenClassification$.html | 20 +- .../dl/BertForTokenClassification.html | 20 +- .../dl/BertForZeroShotClassification$.html | 20 +- .../dl/BertForZeroShotClassification.html | 20 +- .../dl/CamemBertForQuestionAnswering$.html | 20 +- .../dl/CamemBertForQuestionAnswering.html | 20 +- .../CamemBertForSequenceClassification$.html | 20 +- .../CamemBertForSequenceClassification.html | 20 +- .../dl/CamemBertForTokenClassification$.html | 20 +- .../dl/CamemBertForTokenClassification.html | 20 +- .../classifier/dl/ClassifierDLApproach$.html | 20 +- .../classifier/dl/ClassifierDLApproach.html | 20 +- .../classifier/dl/ClassifierDLModel$.html | 20 +- .../classifier/dl/ClassifierDLModel.html | 20 +- .../classifier/dl/ClassifierEncoder.html | 20 +- 
.../classifier/dl/ClassifierMetrics.html | 20 +- .../dl/DeBertaForQuestionAnswering$.html | 20 +- .../dl/DeBertaForQuestionAnswering.html | 20 +- .../dl/DeBertaForSequenceClassification$.html | 20 +- .../dl/DeBertaForSequenceClassification.html | 20 +- .../dl/DeBertaForTokenClassification$.html | 20 +- .../dl/DeBertaForTokenClassification.html | 20 +- .../dl/DistilBertForQuestionAnswering$.html | 20 +- .../dl/DistilBertForQuestionAnswering.html | 20 +- .../DistilBertForSequenceClassification$.html | 20 +- .../DistilBertForSequenceClassification.html | 20 +- .../dl/DistilBertForTokenClassification$.html | 20 +- .../dl/DistilBertForTokenClassification.html | 20 +- .../DistilBertForZeroShotClassification$.html | 20 +- .../DistilBertForZeroShotClassification.html | 20 +- .../dl/LongformerForQuestionAnswering$.html | 20 +- .../dl/LongformerForQuestionAnswering.html | 20 +- .../LongformerForSequenceClassification$.html | 20 +- .../LongformerForSequenceClassification.html | 20 +- .../dl/LongformerForTokenClassification$.html | 20 +- .../dl/LongformerForTokenClassification.html | 20 +- .../dl/MultiClassifierDLApproach.html | 20 +- .../dl/MultiClassifierDLModel$.html | 20 +- .../classifier/dl/MultiClassifierDLModel.html | 20 +- ...ReadAlbertForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadAlbertForSequenceDLModel.html | 20 +- .../dl/ReadAlbertForTokenDLModel.html | 20 +- .../ReadBertForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadBertForSequenceDLModel.html | 20 +- .../dl/ReadBertForTokenDLModel.html | 20 +- .../dl/ReadBertForZeroShotDLModel.html | 20 +- .../dl/ReadCamemBertForQADLModel.html | 20 +- .../dl/ReadCamemBertForSequenceDLModel.html | 20 +- .../dl/ReadCamemBertForTokenDLModel.html | 20 +- .../dl/ReadClassifierDLTensorflowModel.html | 20 +- ...eadDeBertaForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadDeBertaForSequenceDLModel.html | 20 +- .../dl/ReadDeBertaForTokenDLModel.html | 20 +- ...DistilBertForQuestionAnsweringDLModel.html | 20 +- 
.../dl/ReadDistilBertForSequenceDLModel.html | 20 +- .../dl/ReadDistilBertForTokenDLModel.html | 20 +- .../dl/ReadDistilBertForZeroShotDLModel.html | 20 +- ...LongformerForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadLongformerForSequenceDLModel.html | 20 +- .../dl/ReadLongformerForTokenDLModel.html | 20 +- .../ReadMultiClassifierDLTensorflowModel.html | 20 +- ...eadRoBertaForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadRoBertaForSequenceDLModel.html | 20 +- .../dl/ReadRoBertaForTokenDLModel.html | 20 +- .../dl/ReadRoBertaForZeroShotDLModel.html | 20 +- .../dl/ReadSentimentDLTensorflowModel.html | 20 +- .../ReadTapasForQuestionAnsweringDLModel.html | 20 +- ...XlmRoBertaForQuestionAnsweringDLModel.html | 20 +- .../dl/ReadXlmRoBertaForSequenceDLModel.html | 20 +- .../dl/ReadXlmRoBertaForTokenDLModel.html | 20 +- .../dl/ReadXlmRoBertaForZeroShotDLModel.html | 1223 ++++++ .../dl/ReadXlnetForSequenceDLModel.html | 20 +- .../dl/ReadXlnetForTokenDLModel.html | 20 +- .../ReadablePretrainedAlbertForQAModel.html | 20 +- ...dablePretrainedAlbertForSequenceModel.html | 20 +- ...ReadablePretrainedAlbertForTokenModel.html | 20 +- .../dl/ReadablePretrainedBertForQAModel.html | 20 +- ...eadablePretrainedBertForSequenceModel.html | 20 +- .../ReadablePretrainedBertForTokenModel.html | 20 +- ...eadablePretrainedBertForZeroShotModel.html | 20 +- ...ReadablePretrainedCamemBertForQAModel.html | 20 +- ...lePretrainedCamemBertForSequenceModel.html | 20 +- ...dablePretrainedCamemBertForTokenModel.html | 20 +- .../dl/ReadablePretrainedClassifierDL.html | 20 +- .../ReadablePretrainedDeBertaForQAModel.html | 20 +- ...ablePretrainedDeBertaForSequenceModel.html | 20 +- ...eadablePretrainedDeBertaForTokenModel.html | 20 +- ...eadablePretrainedDistilBertForQAModel.html | 20 +- ...ePretrainedDistilBertForSequenceModel.html | 20 +- ...ablePretrainedDistilBertForTokenModel.html | 20 +- ...ePretrainedDistilBertForZeroShotModel.html | 20 +- ...eadablePretrainedLongformerForQAModel.html | 20 +- 
...ePretrainedLongformerForSequenceModel.html | 20 +- ...ablePretrainedLongformerForTokenModel.html | 20 +- .../ReadablePretrainedMultiClassifierDL.html | 20 +- .../ReadablePretrainedRoBertaForQAModel.html | 20 +- ...ablePretrainedRoBertaForSequenceModel.html | 20 +- ...eadablePretrainedRoBertaForTokenModel.html | 20 +- ...ablePretrainedRoBertaForZeroShotModel.html | 20 +- .../dl/ReadablePretrainedSentimentDL.html | 20 +- .../dl/ReadablePretrainedTapasForQAModel.html | 20 +- ...eadablePretrainedXlmRoBertaForQAModel.html | 20 +- ...ePretrainedXlmRoBertaForSequenceModel.html | 20 +- ...ablePretrainedXlmRoBertaForTokenModel.html | 20 +- ...ePretrainedXlmRoBertaForZeroShotModel.html | 1249 ++++++ ...adablePretrainedXlnetForSequenceModel.html | 20 +- .../ReadablePretrainedXlnetForTokenModel.html | 20 +- .../dl/RoBertaForQuestionAnswering$.html | 20 +- .../dl/RoBertaForQuestionAnswering.html | 20 +- .../dl/RoBertaForSequenceClassification$.html | 20 +- .../dl/RoBertaForSequenceClassification.html | 20 +- .../dl/RoBertaForTokenClassification$.html | 20 +- .../dl/RoBertaForTokenClassification.html | 20 +- .../dl/RoBertaForZeroShotClassification$.html | 20 +- .../dl/RoBertaForZeroShotClassification.html | 20 +- .../classifier/dl/SentimentApproach$.html | 20 +- .../classifier/dl/SentimentDLApproach.html | 20 +- .../classifier/dl/SentimentDLModel$.html | 20 +- .../classifier/dl/SentimentDLModel.html | 20 +- .../dl/TapasForQuestionAnswering$.html | 20 +- .../dl/TapasForQuestionAnswering.html | 20 +- .../dl/XlmRoBertaForQuestionAnswering$.html | 20 +- .../dl/XlmRoBertaForQuestionAnswering.html | 20 +- .../XlmRoBertaForSequenceClassification$.html | 20 +- .../XlmRoBertaForSequenceClassification.html | 20 +- .../dl/XlmRoBertaForTokenClassification$.html | 20 +- .../dl/XlmRoBertaForTokenClassification.html | 20 +- .../XlmRoBertaForZeroShotClassification$.html | 1404 +++++++ .../XlmRoBertaForZeroShotClassification.html | 3465 +++++++++++++++++ 
.../dl/XlnetForSequenceClassification$.html | 20 +- .../dl/XlnetForSequenceClassification.html | 20 +- .../dl/XlnetForTokenClassification$.html | 20 +- .../dl/XlnetForTokenClassification.html | 20 +- .../nlp/annotators/classifier/dl/index.html | 133 +- .../nlp/annotators/classifier/index.html | 8 +- .../nlp/annotators/common/Annotated$.html | 8 +- .../nlp/annotators/common/Annotated.html | 8 +- .../nlp/annotators/common/ChunkSplit$.html | 8 +- .../nlp/annotators/common/ConllSentence.html | 8 +- .../DatasetHelpers$$DataFrameHelper.html | 8 +- .../annotators/common/DatasetHelpers$.html | 8 +- .../annotators/common/DependencyParsed$.html | 8 +- .../common/DependencyParsedSentence.html | 8 +- .../common/EmbeddingsWithSentence$.html | 8 +- .../annotators/common/IndexedTaggedWord.html | 8 +- .../nlp/annotators/common/IndexedToken.html | 8 +- .../nlp/annotators/common/InfixToken$.html | 8 +- .../nlp/annotators/common/InfixToken.html | 8 +- .../LabeledDependency$$DependencyInfo.html | 8 +- .../annotators/common/LabeledDependency$.html | 8 +- .../nlp/annotators/common/NerTagged$.html | 8 +- .../nlp/annotators/common/PosTagged$.html | 8 +- .../nlp/annotators/common/PrefixedToken$.html | 8 +- .../nlp/annotators/common/PrefixedToken.html | 8 +- .../common/PreprocessingParser.html | 8 +- .../nlp/annotators/common/Sentence$.html | 8 +- .../nlp/annotators/common/Sentence.html | 8 +- .../nlp/annotators/common/SentenceSplit$.html | 8 +- .../nlp/annotators/common/SuffixedToken$.html | 8 +- .../nlp/annotators/common/SuffixedToken.html | 8 +- .../nlp/annotators/common/TableData$.html | 8 +- .../nlp/annotators/common/TableData.html | 8 +- .../nlp/annotators/common/Tagged.html | 8 +- .../annotators/common/TaggedSentence$.html | 8 +- .../nlp/annotators/common/TaggedSentence.html | 8 +- .../nlp/annotators/common/TaggedWord.html | 8 +- .../nlp/annotators/common/TokenPiece.html | 8 +- .../common/TokenPieceEmbeddings$.html | 8 +- .../common/TokenPieceEmbeddings.html | 8 +- 
.../annotators/common/TokenizedSentence.html | 8 +- .../common/TokenizedWithSentence$.html | 8 +- .../annotators/common/WordWithDependency.html | 8 +- .../common/WordpieceEmbeddingsSentence$.html | 8 +- .../common/WordpieceEmbeddingsSentence.html | 8 +- .../common/WordpieceTokenized$.html | 8 +- .../common/WordpieceTokenizedSentence.html | 8 +- .../nlp/annotators/common/index.html | 8 +- .../ReadSpanBertCorefTensorflowModel.html | 8 +- .../ReadablePretrainedSpanBertCorefModel.html | 8 +- .../annotators/coref/SpanBertCorefModel$.html | 8 +- .../annotators/coref/SpanBertCorefModel.html | 8 +- .../nlp/annotators/coref/index.html | 8 +- .../cv/ConvNextForImageClassification$.html | 8 +- .../cv/ConvNextForImageClassification.html | 8 +- .../cv/ReadConvNextForImageDLModel.html | 8 +- .../cv/ReadSwinForImageDLModel.html | 8 +- .../annotators/cv/ReadViTForImageDLModel.html | 8 +- ...adablePretrainedConvNextForImageModel.html | 8 +- .../ReadablePretrainedSwinForImageModel.html | 8 +- .../ReadablePretrainedViTForImageModel.html | 8 +- .../cv/SwinForImageClassification$.html | 8 +- .../cv/SwinForImageClassification.html | 8 +- .../cv/ViTForImageClassification$.html | 8 +- .../cv/ViTForImageClassification.html | 8 +- .../johnsnowlabs/nlp/annotators/cv/index.html | 8 +- .../er/AhoCorasickAutomaton$Node.html | 8 +- .../annotators/er/AhoCorasickAutomaton.html | 8 +- .../nlp/annotators/er/EntityPattern.html | 8 +- .../annotators/er/EntityRulerApproach.html | 8 +- .../annotators/er/EntityRulerFeatures.html | 8 +- .../nlp/annotators/er/EntityRulerModel$.html | 8 +- .../nlp/annotators/er/EntityRulerModel.html | 8 +- .../nlp/annotators/er/EntityRulerUtil$.html | 8 +- .../annotators/er/FlattenEntityPattern.html | 8 +- .../nlp/annotators/er/PatternsReadWriter.html | 8 +- .../nlp/annotators/er/PatternsReader.html | 8 +- .../er/ReadablePretrainedEntityRuler.html | 8 +- .../er/RegexPatternsReadWriter.html | 8 +- .../annotators/er/RegexPatternsReader.html | 8 +- 
.../johnsnowlabs/nlp/annotators/er/index.html | 8 +- .../johnsnowlabs/nlp/annotators/index.html | 8 +- .../nlp/annotators/keyword/index.html | 8 +- .../keyword/yake/YakeKeywordExtraction$.html | 8 +- .../keyword/yake/YakeKeywordExtraction.html | 8 +- .../annotators/keyword/yake/YakeParams.html | 8 +- .../nlp/annotators/keyword/yake/index.html | 8 +- .../annotators/keyword/yake/util/Token.html | 8 +- .../keyword/yake/util/Utilities$.html | 8 +- .../annotators/keyword/yake/util/index.html | 8 +- .../annotators/ld/dl/LanguageDetectorDL$.html | 8 +- .../annotators/ld/dl/LanguageDetectorDL.html | 8 +- ...ReadLanguageDetectorDLTensorflowModel.html | 8 +- ...ablePretrainedLanguageDetectorDLModel.html | 8 +- .../nlp/annotators/ld/dl/index.html | 8 +- .../johnsnowlabs/nlp/annotators/ld/index.html | 8 +- .../nlp/annotators/ner/ModelMetrics$.html | 8 +- .../nlp/annotators/ner/NamedEntity.html | 8 +- .../nlp/annotators/ner/NerApproach.html | 8 +- .../nlp/annotators/ner/NerConverter$.html | 8 +- .../nlp/annotators/ner/NerConverter.html | 8 +- .../nlp/annotators/ner/NerOverwriter$.html | 8 +- .../nlp/annotators/ner/NerOverwriter.html | 8 +- .../nlp/annotators/ner/NerTagsEncoding$.html | 8 +- .../nlp/annotators/ner/Verbose$.html | 8 +- .../ner/crf/DictionaryFeatures$.html | 8 +- .../ner/crf/DictionaryFeatures.html | 8 +- .../ner/crf/FeatureGenerator$TokenType$.html | 8 +- .../annotators/ner/crf/FeatureGenerator.html | 8 +- .../annotators/ner/crf/NerCrfApproach$.html | 8 +- .../annotators/ner/crf/NerCrfApproach.html | 8 +- .../nlp/annotators/ner/crf/NerCrfModel$.html | 8 +- .../nlp/annotators/ner/crf/NerCrfModel.html | 8 +- .../ner/crf/ReadablePretrainedNerCrf.html | 8 +- .../nlp/annotators/ner/crf/index.html | 8 +- .../nlp/annotators/ner/dl/LoadsContrib$.html | 8 +- .../nlp/annotators/ner/dl/NerDLApproach$.html | 8 +- .../nlp/annotators/ner/dl/NerDLApproach.html | 8 +- .../nlp/annotators/ner/dl/NerDLModel$.html | 8 +- .../nlp/annotators/ner/dl/NerDLModel.html | 8 +- 
.../ner/dl/NerDLModelPythonReader$.html | 8 +- .../ner/dl/ReadZeroShotNerDLModel.html | 8 +- .../ner/dl/ReadablePretrainedNerDL.html | 8 +- .../ner/dl/ReadablePretrainedZeroShotNer.html | 8 +- .../nlp/annotators/ner/dl/ReadsNERGraph.html | 8 +- .../annotators/ner/dl/WithGraphResolver.html | 8 +- .../annotators/ner/dl/ZeroShotNerModel$.html | 8 +- .../annotators/ner/dl/ZeroShotNerModel.html | 8 +- .../nlp/annotators/ner/dl/index.html | 8 +- .../nlp/annotators/ner/index.html | 8 +- ...lizableFormat$$SerializableDateFormat.html | 8 +- .../AnnotatorParam$SerializableFormat$.html | 8 +- .../nlp/annotators/param/AnnotatorParam.html | 8 +- .../annotators/param/EvaluationDLParams.html | 8 +- .../param/ExternalResourceParam.html | 8 +- .../param/SerializedAnnotatorComponent.html | 8 +- .../param/WritableAnnotatorComponent.html | 8 +- .../nlp/annotators/param/index.html | 8 +- .../parser/dep/DependencyParserApproach$.html | 8 +- .../parser/dep/DependencyParserApproach.html | 8 +- .../parser/dep/DependencyParserModel$.html | 8 +- .../parser/dep/DependencyParserModel.html | 8 +- .../GreedyTransition/DependencyMaker$.html | 8 +- .../DependencyMaker$CurrentState.html | 8 +- .../DependencyMaker$ParseState.html | 8 +- .../dep/GreedyTransition/DependencyMaker.html | 8 +- .../GreedyTransitionApproach$.html | 8 +- .../parser/dep/GreedyTransition/index.html | 8 +- .../GreedyTransition/package$$Feature.html | 8 +- .../GreedyTransition/package$$WordData.html | 8 +- .../parser/dep/Perceptron$WeightLearner.html | 8 +- .../nlp/annotators/parser/dep/Perceptron.html | 8 +- .../dep/ReadablePretrainedDependency.html | 8 +- .../annotators/parser/dep/TagDictionary$.html | 8 +- .../nlp/annotators/parser/dep/Tagger$.html | 8 +- .../nlp/annotators/parser/dep/Tagger.html | 8 +- .../nlp/annotators/parser/dep/index.html | 8 +- .../nlp/annotators/parser/index.html | 8 +- .../annotators/parser/typdep/ConllData.html | 8 +- .../parser/typdep/DependencyArcList.html | 8 +- 
.../parser/typdep/DependencyInstance.html | 8 +- .../parser/typdep/DependencyPipe.html | 8 +- .../parser/typdep/LocalFeatureData.html | 8 +- .../parser/typdep/LowRankTensor.html | 8 +- .../nlp/annotators/parser/typdep/Options.html | 8 +- .../annotators/parser/typdep/Parameters.html | 8 +- .../parser/typdep/PredictionParameters.html | 8 +- .../ReadablePretrainedTypedDependency.html | 8 +- .../parser/typdep/TrainDependencies.html | 8 +- .../annotators/parser/typdep/TrainFile.html | 8 +- .../parser/typdep/TypedDependencyParser.html | 8 +- .../TypedDependencyParserApproach$.html | 8 +- .../typdep/TypedDependencyParserApproach.html | 8 +- .../typdep/TypedDependencyParserModel$.html | 8 +- .../typdep/TypedDependencyParserModel.html | 8 +- .../typdep/feature/FeatureTemplate.html | 8 +- .../feature/SyntacticFeatureFactory.html | 8 +- .../parser/typdep/feature/index.html | 8 +- .../nlp/annotators/parser/typdep/index.html | 8 +- .../parser/typdep/io/Conll09Reader.html | 8 +- .../parser/typdep/io/ConllUReader.html | 8 +- .../parser/typdep/io/ConllWriter.html | 8 +- .../parser/typdep/io/DependencyReader.html | 8 +- .../annotators/parser/typdep/io/index.html | 8 +- .../parser/typdep/util/Alphabet.html | 8 +- .../parser/typdep/util/Collector.html | 8 +- .../parser/typdep/util/DependencyLabel.html | 8 +- .../parser/typdep/util/Dictionary.html | 8 +- .../parser/typdep/util/DictionarySet.html | 8 +- .../parser/typdep/util/FeatureVector.html | 8 +- .../parser/typdep/util/ScoreCollector.html | 8 +- .../annotators/parser/typdep/util/Utils.html | 8 +- .../annotators/parser/typdep/util/index.html | 8 +- .../nlp/annotators/pos/index.html | 8 +- .../pos/perceptron/AveragedPerceptron.html | 8 +- .../pos/perceptron/PerceptronApproach$.html | 8 +- .../pos/perceptron/PerceptronApproach.html | 8 +- .../PerceptronApproachDistributed$.html | 8 +- .../PerceptronApproachDistributed.html | 8 +- .../pos/perceptron/PerceptronModel$.html | 8 +- .../pos/perceptron/PerceptronModel.html | 8 +- 
.../perceptron/PerceptronPredictionUtils.html | 8 +- .../perceptron/PerceptronTrainingUtils.html | 8 +- .../pos/perceptron/PerceptronUtils.html | 8 +- .../ReadablePretrainedPerceptron.html | 8 +- .../StringMapStringDoubleAccumulator.html | 8 +- .../perceptron/TrainingPerceptronLegacy.html | 8 +- .../TupleKeyLongDoubleMapAccumulator.html | 8 +- .../nlp/annotators/pos/perceptron/index.html | 8 +- .../sbd/SentenceDetectorParams.html | 8 +- .../nlp/annotators/sbd/index.html | 8 +- .../sbd/pragmatic/CustomPragmaticMethod.html | 8 +- .../sbd/pragmatic/DefaultPragmaticMethod.html | 8 +- .../sbd/pragmatic/MixedPragmaticMethod.html | 8 +- .../pragmatic/PragmaticContentFormatter$.html | 8 +- .../pragmatic/PragmaticContentFormatter.html | 8 +- .../sbd/pragmatic/PragmaticDictionaries$.html | 8 +- .../sbd/pragmatic/PragmaticMethod.html | 8 +- .../pragmatic/PragmaticSentenceExtractor.html | 8 +- .../sbd/pragmatic/PragmaticSymbols$.html | 8 +- .../annotators/sbd/pragmatic/RuleSymbols.html | 8 +- .../sbd/pragmatic/SentenceDetector$.html | 8 +- .../sbd/pragmatic/SentenceDetector.html | 8 +- .../nlp/annotators/sbd/pragmatic/index.html | 8 +- .../nlp/annotators/sda/index.html | 8 +- .../sda/pragmatic/PragmaticScorer.html | 8 +- .../sda/pragmatic/SentimentDetector$.html | 8 +- .../sda/pragmatic/SentimentDetector.html | 8 +- .../pragmatic/SentimentDetectorModel$.html | 8 +- .../sda/pragmatic/SentimentDetectorModel.html | 8 +- .../nlp/annotators/sda/pragmatic/index.html | 8 +- .../sda/vivekn/ReadablePretrainedVivekn.html | 8 +- .../sda/vivekn/ViveknSentimentApproach.html | 8 +- .../sda/vivekn/ViveknSentimentModel$.html | 8 +- .../sda/vivekn/ViveknSentimentModel.html | 8 +- .../sda/vivekn/ViveknSentimentUtils.html | 8 +- .../nlp/annotators/sda/vivekn/index.html | 8 +- .../sentence_detector_dl/Metrics.html | 8 +- .../ReadablePretrainedSentenceDetectorDL.html | 8 +- .../ReadsSentenceDetectorDLGraph.html | 8 +- .../SentenceDetectorDLApproach.html | 8 +- .../SentenceDetectorDLEncoder$.html | 
8 +- .../SentenceDetectorDLEncoder.html | 8 +- .../SentenceDetectorDLEncoderParam.html | 8 +- .../SentenceDetectorDLModel$.html | 8 +- .../SentenceDetectorDLModel.html | 8 +- .../sentence_detector_dl/index.html | 8 +- .../annotators/seq2seq/BartTransformer$.html | 8 +- .../annotators/seq2seq/BartTransformer.html | 8 +- .../annotators/seq2seq/GPT2Transformer$.html | 8 +- .../annotators/seq2seq/GPT2Transformer.html | 8 +- .../seq2seq/MarianTransformer$.html | 8 +- .../annotators/seq2seq/MarianTransformer.html | 24 +- .../seq2seq/ReadBartTransformerDLModel.html | 8 +- .../seq2seq/ReadGPT2TransformerDLModel.html | 8 +- .../seq2seq/ReadMarianMTDLModel.html | 8 +- .../seq2seq/ReadT5TransformerDLModel.html | 8 +- ...eadablePretrainedBartTransformerModel.html | 8 +- ...eadablePretrainedGPT2TransformerModel.html | 8 +- .../ReadablePretrainedMarianMTModel.html | 8 +- .../ReadablePretrainedT5TransformerModel.html | 8 +- .../annotators/seq2seq/T5Transformer$.html | 8 +- .../nlp/annotators/seq2seq/T5Transformer.html | 8 +- .../nlp/annotators/seq2seq/index.html | 8 +- .../DocumentSimilarityRankerApproach$.html | 8 +- .../DocumentSimilarityRankerApproach.html | 8 +- .../DocumentSimilarityRankerModel$.html | 8 +- .../DocumentSimilarityRankerModel.html | 8 +- .../similarity/IndexedNeighbors.html | 8 +- .../IndexedNeighborsWithDistance.html | 8 +- .../similarity/NeighborAnnotation.html | 8 +- .../similarity/NeighborsResultSet.html | 8 +- .../ReadableDocumentSimilarityRanker.html | 8 +- .../nlp/annotators/similarity/index.html | 8 +- .../spell/context/CandidateStrategy$.html | 8 +- ...ntextSpellCheckerApproach$ArrayHelper.html | 8 +- .../context/ContextSpellCheckerApproach.html | 8 +- .../context/ContextSpellCheckerModel$.html | 8 +- .../ContextSpellCheckerModel$StringTools.html | 8 +- .../context/ContextSpellCheckerModel.html | 8 +- .../spell/context/HasTransducerFeatures.html | 8 +- .../spell/context/LangModelSentence.html | 8 +- .../ReadablePretrainedContextSpell.html | 8 +- 
.../context/ReadsLanguageModelGraph.html | 8 +- .../spell/context/WeightedLevenshtein.html | 8 +- .../nlp/annotators/spell/context/index.html | 8 +- .../spell/context/parser/AgeToken.html | 8 +- .../spell/context/parser/DateToken.html | 8 +- .../context/parser/GenericRegexParser.html | 8 +- .../context/parser/GenericVocabParser.html | 8 +- .../spell/context/parser/LocationClass.html | 8 +- .../spell/context/parser/MainVocab.html | 8 +- .../spell/context/parser/MedicationClass.html | 8 +- .../spell/context/parser/NamesClass.html | 8 +- .../spell/context/parser/NumberToken.html | 8 +- .../spell/context/parser/RegexParser.html | 8 +- .../context/parser/SerializableClass.html | 8 +- .../context/parser/SpecialClassParser.html | 8 +- .../context/parser/TransducerSeqFeature.html | 8 +- .../spell/context/parser/UnitToken.html | 8 +- .../spell/context/parser/VocabParser.html | 8 +- .../spell/context/parser/index.html | 8 +- .../nlp/annotators/spell/index.html | 8 +- .../spell/norvig/NorvigSweetingApproach$.html | 8 +- .../spell/norvig/NorvigSweetingApproach.html | 8 +- .../spell/norvig/NorvigSweetingModel$.html | 8 +- .../spell/norvig/NorvigSweetingModel.html | 8 +- .../spell/norvig/NorvigSweetingParams.html | 8 +- .../norvig/ReadablePretrainedNorvig.html | 8 +- .../nlp/annotators/spell/norvig/index.html | 8 +- .../ReadablePretrainedSymmetric.html | 8 +- .../symmetric/SymmetricDeleteApproach$.html | 8 +- .../symmetric/SymmetricDeleteApproach.html | 8 +- .../symmetric/SymmetricDeleteModel$.html | 8 +- .../SymmetricDeleteModel$SuggestedWord.html | 8 +- .../spell/symmetric/SymmetricDeleteModel.html | 8 +- .../symmetric/SymmetricDeleteParams.html | 8 +- .../nlp/annotators/spell/symmetric/index.html | 8 +- .../nlp/annotators/spell/util/Utilities$.html | 8 +- .../nlp/annotators/spell/util/index.html | 8 +- .../nlp/annotators/tapas/TapasCellDate$.html | 8 +- .../nlp/annotators/tapas/TapasCellDate.html | 8 +- .../nlp/annotators/tapas/TapasCellValue$.html | 8 +- 
.../nlp/annotators/tapas/TapasCellValue.html | 8 +- .../nlp/annotators/tapas/TapasEncoder.html | 8 +- .../nlp/annotators/tapas/TapasInputData.html | 8 +- .../tapas/TapasNumericRelation$.html | 8 +- .../tapas/TapasNumericValueSpan$.html | 8 +- .../tapas/TapasNumericValueSpan.html | 8 +- .../nlp/annotators/tapas/index.html | 8 +- .../tokenizer/bpe/BartTokenizer.html | 8 +- .../tokenizer/bpe/BpeTokenizer$.html | 8 +- .../tokenizer/bpe/Gpt2Tokenizer.html | 8 +- .../tokenizer/bpe/RobertaTokenizer.html | 8 +- .../tokenizer/bpe/SpecialToken.html | 8 +- .../nlp/annotators/tokenizer/bpe/index.html | 8 +- .../nlp/annotators/tokenizer/index.html | 8 +- .../ws/ReadablePretrainedWordSegmenter.html | 8 +- .../nlp/annotators/ws/TagsType$.html | 8 +- .../annotators/ws/WordSegmenterApproach$.html | 8 +- .../annotators/ws/WordSegmenterApproach.html | 8 +- .../annotators/ws/WordSegmenterModel$.html | 8 +- .../nlp/annotators/ws/WordSegmenterModel.html | 8 +- .../johnsnowlabs/nlp/annotators/ws/index.html | 8 +- .../nlp/embeddings/AlbertEmbeddings$.html | 46 +- .../nlp/embeddings/AlbertEmbeddings.html | 38 +- .../nlp/embeddings/BertEmbeddings$.html | 8 +- .../nlp/embeddings/BertEmbeddings.html | 8 +- .../embeddings/BertSentenceEmbeddings$.html | 8 +- .../embeddings/BertSentenceEmbeddings.html | 8 +- .../nlp/embeddings/CamemBertEmbeddings$.html | 46 +- .../nlp/embeddings/CamemBertEmbeddings.html | 38 +- .../nlp/embeddings/ChunkEmbeddings$.html | 8 +- .../nlp/embeddings/ChunkEmbeddings.html | 8 +- .../nlp/embeddings/DeBertaEmbeddings$.html | 8 +- .../nlp/embeddings/DeBertaEmbeddings.html | 8 +- .../nlp/embeddings/DistilBertEmbeddings$.html | 8 +- .../nlp/embeddings/DistilBertEmbeddings.html | 8 +- .../nlp/embeddings/Doc2VecApproach$.html | 8 +- .../nlp/embeddings/Doc2VecApproach.html | 8 +- .../nlp/embeddings/Doc2VecModel$.html | 8 +- .../nlp/embeddings/Doc2VecModel.html | 8 +- .../nlp/embeddings/E5Embeddings$.html | 8 +- .../nlp/embeddings/E5Embeddings.html | 8 +- 
.../nlp/embeddings/ElmoEmbeddings$.html | 8 +- .../nlp/embeddings/ElmoEmbeddings.html | 8 +- .../EmbeddingsCoverage$CoverageResult.html | 8 +- .../nlp/embeddings/EmbeddingsCoverage.html | 8 +- .../embeddings/HasEmbeddingsProperties.html | 8 +- .../nlp/embeddings/InstructorEmbeddings$.html | 8 +- .../nlp/embeddings/InstructorEmbeddings.html | 8 +- .../nlp/embeddings/LongformerEmbeddings$.html | 8 +- .../nlp/embeddings/LongformerEmbeddings.html | 8 +- .../PoolingStrategy$$AnnotatorType$.html | 8 +- .../nlp/embeddings/PoolingStrategy$.html | 8 +- .../nlp/embeddings/ReadAlbertDLModel.html | 50 +- .../nlp/embeddings/ReadBertDLModel.html | 8 +- .../embeddings/ReadBertSentenceDLModel.html | 8 +- .../nlp/embeddings/ReadCamemBertDLModel.html | 50 +- .../nlp/embeddings/ReadDeBertaDLModel.html | 8 +- .../nlp/embeddings/ReadDistilBertDLModel.html | 8 +- .../nlp/embeddings/ReadE5DLModel.html | 8 +- .../nlp/embeddings/ReadElmoDLModel.html | 8 +- .../nlp/embeddings/ReadInstructorDLModel.html | 8 +- .../nlp/embeddings/ReadLongformerDLModel.html | 8 +- .../nlp/embeddings/ReadRobertaDLModel.html | 8 +- .../ReadRobertaSentenceDLModel.html | 8 +- .../nlp/embeddings/ReadUSEDLModel.html | 8 +- .../nlp/embeddings/ReadXlmRobertaDLModel.html | 50 +- .../ReadXlmRobertaSentenceDLModel.html | 8 +- .../nlp/embeddings/ReadXlnetDLModel.html | 8 +- .../ReadablePretrainedAlbertModel.html | 8 +- .../ReadablePretrainedBertModel.html | 8 +- .../ReadablePretrainedBertSentenceModel.html | 8 +- .../ReadablePretrainedCamemBertModel.html | 8 +- .../ReadablePretrainedDeBertaModel.html | 8 +- .../ReadablePretrainedDistilBertModel.html | 8 +- .../embeddings/ReadablePretrainedDoc2Vec.html | 8 +- .../embeddings/ReadablePretrainedE5Model.html | 8 +- .../ReadablePretrainedElmoModel.html | 8 +- .../ReadablePretrainedInstructorModel.html | 8 +- .../ReadablePretrainedLongformerModel.html | 8 +- .../ReadablePretrainedRobertaModel.html | 8 +- ...eadablePretrainedRobertaSentenceModel.html | 8 +- 
.../ReadablePretrainedUSEModel.html | 8 +- .../ReadablePretrainedWord2Vec.html | 8 +- .../ReadablePretrainedWordEmbeddings.html | 8 +- .../ReadablePretrainedXlmRobertaModel.html | 8 +- ...ablePretrainedXlmRobertaSentenceModel.html | 8 +- .../ReadablePretrainedXlnetModel.html | 8 +- .../nlp/embeddings/ReadsFromBytes.html | 8 +- .../nlp/embeddings/RoBertaEmbeddings$.html | 8 +- .../nlp/embeddings/RoBertaEmbeddings.html | 8 +- .../RoBertaSentenceEmbeddings$.html | 8 +- .../embeddings/RoBertaSentenceEmbeddings.html | 8 +- .../nlp/embeddings/SentenceEmbeddings$.html | 8 +- .../nlp/embeddings/SentenceEmbeddings.html | 8 +- .../embeddings/UniversalSentenceEncoder$.html | 8 +- .../embeddings/UniversalSentenceEncoder.html | 8 +- .../nlp/embeddings/Word2VecApproach$.html | 8 +- .../nlp/embeddings/Word2VecApproach.html | 8 +- .../nlp/embeddings/Word2VecModel$.html | 8 +- .../nlp/embeddings/Word2VecModel.html | 8 +- .../nlp/embeddings/WordEmbeddings$.html | 8 +- .../nlp/embeddings/WordEmbeddings.html | 8 +- .../WordEmbeddingsBinaryIndexer$.html | 8 +- .../nlp/embeddings/WordEmbeddingsModel$.html | 8 +- .../nlp/embeddings/WordEmbeddingsModel.html | 8 +- .../nlp/embeddings/WordEmbeddingsReader.html | 8 +- .../WordEmbeddingsTextIndexer$.html | 8 +- .../nlp/embeddings/WordEmbeddingsWriter.html | 8 +- .../nlp/embeddings/XlmRoBertaEmbeddings$.html | 46 +- .../nlp/embeddings/XlmRoBertaEmbeddings.html | 38 +- .../XlmRoBertaSentenceEmbeddings$.html | 8 +- .../XlmRoBertaSentenceEmbeddings.html | 14 +- .../nlp/embeddings/XlnetEmbeddings$.html | 8 +- .../nlp/embeddings/XlnetEmbeddings.html | 8 +- .../johnsnowlabs/nlp/embeddings/index.html | 32 +- .../DocumentSimilarityRankerFinisher$.html | 8 +- .../DocumentSimilarityRankerFinisher.html | 8 +- .../com/johnsnowlabs/nlp/finisher/index.html | 8 +- .../nlp/functions$$EachAnnotations.html | 8 +- .../nlp/functions$$ExplodeAnnotations.html | 8 +- .../nlp/functions$$FilterAnnotations.html | 8 +- .../nlp/functions$$MapAnnotations.html | 8 +- 
docs/api/com/johnsnowlabs/nlp/functions$.html | 8 +- docs/api/com/johnsnowlabs/nlp/index.html | 8 +- .../nlp/pretrained/PretrainedPipeline$.html | 8 +- .../nlp/pretrained/PretrainedPipeline.html | 8 +- .../pretrained/PythonResourceDownloader$.html | 8 +- .../nlp/pretrained/RepositoryMetadata.html | 8 +- .../nlp/pretrained/ResourceDownloader$.html | 8 +- .../nlp/pretrained/ResourceDownloader.html | 8 +- .../nlp/pretrained/ResourceMetadata$.html | 8 +- .../nlp/pretrained/ResourceMetadata.html | 8 +- .../nlp/pretrained/ResourceRequest.html | 8 +- .../nlp/pretrained/ResourceType$.html | 8 +- .../nlp/pretrained/S3ResourceDownloader.html | 8 +- .../johnsnowlabs/nlp/pretrained/index.html | 8 +- .../com/johnsnowlabs/nlp/recursive/index.html | 8 +- .../nlp/recursive/package$$Recursive.html | 8 +- .../recursive/package$$RecursiveModel.html | 8 +- .../nlp/serialization/ArrayFeature.html | 8 +- .../nlp/serialization/Feature.html | 8 +- .../nlp/serialization/MapFeature.html | 8 +- .../SerializedExternalResource.html | 8 +- .../nlp/serialization/SetFeature.html | 8 +- .../nlp/serialization/StructFeature.html | 8 +- .../nlp/serialization/TransducerFeature.html | 8 +- .../johnsnowlabs/nlp/serialization/index.html | 8 +- .../com/johnsnowlabs/nlp/training/CoNLL.html | 8 +- .../nlp/training/CoNLL2003NerReader.html | 8 +- .../nlp/training/CoNLLDocument.html | 8 +- .../CoNLLHelper$$CoNLLSentenceCols.html | 8 +- .../training/CoNLLHelper$$CoNLLTokenCols.html | 8 +- .../nlp/training/CoNLLHelper$.html | 8 +- .../com/johnsnowlabs/nlp/training/CoNLLU.html | 8 +- .../nlp/training/CoNLLUCols$.html | 8 +- .../nlp/training/CoNLLUDocument.html | 8 +- .../com/johnsnowlabs/nlp/training/POS.html | 8 +- .../johnsnowlabs/nlp/training/PubTator.html | 8 +- .../nlp/training/SpacyToAnnotation.html | 8 +- .../com/johnsnowlabs/nlp/training/index.html | 8 +- .../johnsnowlabs/nlp/util/FinisherUtil$.html | 8 +- .../johnsnowlabs/nlp/util/GraphBuilder.html | 8 +- .../nlp/util/LfuCache$CachedItem.html | 8 +- 
.../nlp/util/LfuCache$DoubleLinked.html | 8 +- .../nlp/util/LfuCache$FrequencyList.html | 8 +- .../com/johnsnowlabs/nlp/util/LfuCache.html | 8 +- .../nlp/util/LruMap$KeyPriority.html | 8 +- .../nlp/util/LruMap$KeyPriorityOrdering$.html | 8 +- .../api/com/johnsnowlabs/nlp/util/LruMap.html | 8 +- .../nlp/util/SparkNlpConfigKeys$.html | 8 +- docs/api/com/johnsnowlabs/nlp/util/index.html | 8 +- .../nlp/util/io/ExternalResource$.html | 8 +- .../nlp/util/io/ExternalResource.html | 8 +- .../nlp/util/io/MatchStrategy$.html | 8 +- .../nlp/util/io/OutputHelper$.html | 8 +- .../com/johnsnowlabs/nlp/util/io/ReadAs$.html | 8 +- .../util/io/ResourceHelper$$SourceStream.html | 8 +- .../nlp/util/io/ResourceHelper$.html | 8 +- .../com/johnsnowlabs/nlp/util/io/index.html | 8 +- .../nlp/util/regex/RegexRule.html | 8 +- .../util/regex/RuleFactory$$RuleMatch.html | 8 +- .../nlp/util/regex/RuleFactory$.html | 8 +- .../nlp/util/regex/RuleFactory.html | 8 +- .../nlp/util/regex/TransformStrategy$.html | 8 +- .../johnsnowlabs/nlp/util/regex/index.html | 8 +- .../com/johnsnowlabs/storage/BytesKey.html | 8 +- .../com/johnsnowlabs/storage/Database$.html | 8 +- .../com/johnsnowlabs/storage/Database.html | 8 +- .../johnsnowlabs/storage/HasConnection.html | 8 +- .../com/johnsnowlabs/storage/HasStorage.html | 8 +- .../johnsnowlabs/storage/HasStorageModel.html | 8 +- .../storage/HasStorageOptions.html | 8 +- .../storage/HasStorageReader.html | 8 +- .../johnsnowlabs/storage/HasStorageRef$.html | 8 +- .../johnsnowlabs/storage/HasStorageRef.html | 8 +- .../storage/RocksDBConnection$.html | 8 +- .../storage/RocksDBConnection.html | 8 +- .../storage/StorageBatchWriter.html | 8 +- .../johnsnowlabs/storage/StorageFormat.html | 8 +- .../johnsnowlabs/storage/StorageHelper$.html | 8 +- .../johnsnowlabs/storage/StorageLocator$.html | 8 +- .../johnsnowlabs/storage/StorageLocator.html | 8 +- .../storage/StorageReadWriter.html | 8 +- .../johnsnowlabs/storage/StorageReadable.html | 8 +- 
.../johnsnowlabs/storage/StorageReader.html | 8 +- .../johnsnowlabs/storage/StorageWriter.html | 8 +- docs/api/com/johnsnowlabs/storage/index.html | 8 +- .../api/com/johnsnowlabs/util/Benchmark$.html | 8 +- docs/api/com/johnsnowlabs/util/Build$.html | 8 +- .../johnsnowlabs/util/CoNLLGenerator$.html | 8 +- .../com/johnsnowlabs/util/ConfigHelper$.html | 8 +- .../com/johnsnowlabs/util/ConfigLoader$.html | 8 +- .../com/johnsnowlabs/util/FileHelper$.html | 8 +- .../com/johnsnowlabs/util/JsonParser$.html | 8 +- .../johnsnowlabs/util/PipelineModels$.html | 8 +- .../johnsnowlabs/util/TrainingHelper$.html | 8 +- docs/api/com/johnsnowlabs/util/Version$.html | 8 +- docs/api/com/johnsnowlabs/util/Version.html | 8 +- .../johnsnowlabs/util/ZipArchiveUtil$.html | 8 +- docs/api/com/johnsnowlabs/util/index.html | 8 +- .../util/spark/LongMapAccumulator.html | 8 +- .../util/spark/MapAccumulator.html | 8 +- .../johnsnowlabs/util/spark/SparkUtil$.html | 8 +- .../com/johnsnowlabs/util/spark/index.html | 8 +- docs/api/index.html | 8 +- docs/api/index.js | 2 +- docs/api/python/.buildinfo | 2 +- docs/api/python/genindex.html | 25 +- docs/api/python/getting_started/index.html | 20 +- docs/api/python/index.html | 2 +- docs/api/python/modules/index.html | 3 +- docs/api/python/modules/sparknlp.html | 6 +- .../python/modules/sparknlp/annotation.html | 2 +- .../modules/sparknlp/annotation_audio.html | 2 +- .../modules/sparknlp/annotation_image.html | 2 +- .../annotator/audio/hubert_for_ctc.html | 2 +- .../annotator/audio/wav2vec2_for_ctc.html | 2 +- .../sparknlp/annotator/chunk2_doc.html | 2 +- .../modules/sparknlp/annotator/chunker.html | 2 +- .../albert_for_question_answering.html | 2 +- .../albert_for_sequence_classification.html | 2 +- .../albert_for_token_classification.html | 2 +- .../bert_for_question_answering.html | 2 +- .../bert_for_sequence_classification.html | 2 +- .../bert_for_token_classification.html | 2 +- .../bert_for_zero_shot_classification.html | 2 +- 
.../camembert_for_question_answering.html | 2 +- ...camembert_for_sequence_classification.html | 2 +- .../camembert_for_token_classification.html | 2 +- .../classifier_dl/classifier_dl.html | 2 +- .../deberta_for_question_answering.html | 2 +- .../deberta_for_sequence_classification.html | 2 +- .../deberta_for_token_classification.html | 2 +- .../distil_bert_for_question_answering.html | 2 +- ...stil_bert_for_sequence_classification.html | 2 +- .../distil_bert_for_token_classification.html | 2 +- ...til_bert_for_zero_shot_classification.html | 2 +- .../longformer_for_question_answering.html | 2 +- ...ongformer_for_sequence_classification.html | 2 +- .../longformer_for_token_classification.html | 2 +- .../classifier_dl/multi_classifier_dl.html | 2 +- ...rta_bert_for_zero_shot_classification.html | 2 +- .../roberta_for_question_answering.html | 2 +- .../roberta_for_sequence_classification.html | 2 +- .../roberta_for_token_classification.html | 2 +- .../annotator/classifier_dl/sentiment_dl.html | 2 +- .../tapas_for_question_answering.html | 2 +- .../xlm_roberta_for_question_answering.html | 2 +- ...m_roberta_for_sequence_classification.html | 2 +- .../xlm_roberta_for_token_classification.html | 2 +- ..._roberta_for_zero_shot_classification.html | 641 +++ .../xlnet_for_sequence_classification.html | 2 +- .../xlnet_for_token_classification.html | 2 +- .../annotator/coref/spanbert_coref.html | 2 +- .../cv/convnext_for_image_classification.html | 2 +- .../cv/swin_for_image_classification.html | 2 +- .../cv/vit_for_image_classification.html | 2 +- .../sparknlp/annotator/date2_chunk.html | 2 +- .../dependency/dependency_parser.html | 2 +- .../dependency/typed_dependency_parser.html | 2 +- .../annotator/document_normalizer.html | 2 +- .../embeddings/albert_embeddings.html | 2 +- .../annotator/embeddings/bert_embeddings.html | 2 +- .../embeddings/bert_sentence_embeddings.html | 2 +- .../embeddings/camembert_embeddings.html | 2 +- .../embeddings/chunk_embeddings.html | 2 +- 
.../embeddings/deberta_embeddings.html | 2 +- .../embeddings/distil_bert_embeddings.html | 2 +- .../annotator/embeddings/doc2vec.html | 2 +- .../annotator/embeddings/e5_embeddings.html | 2 +- .../annotator/embeddings/elmo_embeddings.html | 2 +- .../embeddings/instructor_embeddings.html | 2 +- .../embeddings/longformer_embeddings.html | 2 +- .../embeddings/roberta_embeddings.html | 2 +- .../roberta_sentence_embeddings.html | 2 +- .../embeddings/sentence_embeddings.html | 2 +- .../universal_sentence_encoder.html | 2 +- .../annotator/embeddings/word2vec.html | 2 +- .../annotator/embeddings/word_embeddings.html | 2 +- .../embeddings/xlm_roberta_embeddings.html | 2 +- .../xlm_roberta_sentence_embeddings.html | 2 +- .../embeddings/xlnet_embeddings.html | 2 +- .../sparknlp/annotator/er/entity_ruler.html | 2 +- .../sparknlp/annotator/graph_extraction.html | 2 +- .../yake_keyword_extraction.html | 2 +- .../annotator/ld_dl/language_detector_dl.html | 2 +- .../sparknlp/annotator/lemmatizer.html | 2 +- .../annotator/matcher/big_text_matcher.html | 2 +- .../annotator/matcher/date_matcher.html | 2 +- .../annotator/matcher/multi_date_matcher.html | 2 +- .../annotator/matcher/regex_matcher.html | 2 +- .../annotator/matcher/text_matcher.html | 2 +- .../sparknlp/annotator/n_gram_generator.html | 2 +- .../sparknlp/annotator/ner/ner_approach.html | 2 +- .../sparknlp/annotator/ner/ner_converter.html | 2 +- .../sparknlp/annotator/ner/ner_crf.html | 2 +- .../sparknlp/annotator/ner/ner_dl.html | 2 +- .../annotator/ner/ner_overwriter.html | 2 +- .../annotator/ner/zero_shot_ner_model.html | 2 +- .../sparknlp/annotator/normalizer.html | 2 +- .../annotator/param/classifier_encoder.html | 2 +- .../annotator/param/evaluation_dl_params.html | 2 +- .../sparknlp/annotator/pos/perceptron.html | 2 +- .../annotator/sentence/sentence_detector.html | 2 +- .../sentence/sentence_detector_dl.html | 2 +- .../sentiment/sentiment_detector.html | 2 +- .../annotator/sentiment/vivekn_sentiment.html | 2 +- 
.../annotator/seq2seq/bart_transformer.html | 2 +- .../annotator/seq2seq/gpt2_transformer.html | 2 +- .../annotator/seq2seq/marian_transformer.html | 2 +- .../annotator/seq2seq/t5_transformer.html | 2 +- .../document_similarity_ranker.html | 2 +- .../spell_check/context_spell_checker.html | 2 +- .../spell_check/norvig_sweeting.html | 2 +- .../spell_check/symmetric_delete.html | 2 +- .../modules/sparknlp/annotator/stemmer.html | 2 +- .../annotator/stop_words_cleaner.html | 2 +- .../annotator/tf_ner_dl_graph_builder.html | 2 +- .../annotator/token/chunk_tokenizer.html | 2 +- .../annotator/token/recursive_tokenizer.html | 2 +- .../annotator/token/regex_tokenizer.html | 2 +- .../sparknlp/annotator/token/tokenizer.html | 2 +- .../sparknlp/annotator/ws/word_segmenter.html | 2 +- .../sparknlp/base/audio_assembler.html | 2 +- .../modules/sparknlp/base/doc2_chunk.html | 2 +- .../sparknlp/base/document_assembler.html | 2 +- .../sparknlp/base/embeddings_finisher.html | 2 +- .../modules/sparknlp/base/finisher.html | 2 +- .../modules/sparknlp/base/graph_finisher.html | 2 +- .../sparknlp/base/has_recursive_fit.html | 2 +- .../base/has_recursive_transform.html | 2 +- .../sparknlp/base/image_assembler.html | 2 +- .../modules/sparknlp/base/light_pipeline.html | 2 +- .../base/multi_document_assembler.html | 2 +- .../sparknlp/base/recursive_pipeline.html | 2 +- .../sparknlp/base/table_assembler.html | 2 +- .../modules/sparknlp/base/token2_chunk.html | 2 +- .../sparknlp/base/token_assembler.html | 2 +- .../sparknlp/common/annotator_approach.html | 2 +- .../sparknlp/common/annotator_model.html | 2 +- .../sparknlp/common/annotator_properties.html | 2 +- .../sparknlp/common/match_strategy.html | 2 +- .../modules/sparknlp/common/properties.html | 2 +- .../modules/sparknlp/common/read_as.html | 2 +- .../common/recursive_annotator_approach.html | 2 +- .../python/modules/sparknlp/common/utils.html | 2 +- .../python/modules/sparknlp/functions.html | 2 +- 
.../sparknlp/internal/annotator_java_ml.html | 2 +- .../internal/annotator_transformer.html | 2 +- .../internal/extended_java_wrapper.html | 2 +- .../internal/params_getters_setters.html | 2 +- .../modules/sparknlp/internal/recursive.html | 2 +- .../modules/sparknlp/logging/comet.html | 2 +- .../pretrained/pretrained_pipeline.html | 2 +- .../pretrained/resource_downloader.html | 2 +- .../modules/sparknlp/training/conll.html | 2 +- .../modules/sparknlp/training/conllu.html | 2 +- .../python/modules/sparknlp/training/pos.html | 2 +- .../modules/sparknlp/training/pub_tator.html | 2 +- .../training/spacy_to_annotation.html | 2 +- docs/api/python/objects.inv | Bin 12495 -> 12563 bytes docs/api/python/py-modindex.html | 7 +- .../sparknlp/annotation/index.html | 2 +- .../sparknlp/annotation_audio/index.html | 2 +- .../sparknlp/annotation_image/index.html | 2 +- .../annotator/audio/hubert_for_ctc/index.html | 2 +- .../sparknlp/annotator/audio/index.html | 2 +- .../audio/wav2vec2_for_ctc/index.html | 2 +- .../sparknlp/annotator/chunk2_doc/index.html | 3 +- .../sparknlp/annotator/chunker/index.html | 3 +- .../albert_for_question_answering/index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../bert_for_question_answering/index.html | 2 +- .../index.html | 3 +- .../bert_for_token_classification/index.html | 3 +- .../index.html | 3 +- .../index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../classifier_dl/classifier_dl/index.html | 3 +- .../deberta_for_question_answering/index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../index.html | 3 +- .../annotator/classifier_dl/index.html | 4 +- .../index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../multi_classifier_dl/index.html | 3 +- .../index.html | 3 +- .../roberta_for_question_answering/index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../classifier_dl/sentiment_dl/index.html | 3 +- 
.../tapas_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../index.html | 803 ++++ .../index.html | 3 +- .../xlnet_for_token_classification/index.html | 3 +- .../sparknlp/annotator/coref/index.html | 2 +- .../annotator/coref/spanbert_coref/index.html | 2 +- .../index.html | 2 +- .../sparknlp/annotator/cv/index.html | 2 +- .../swin_for_image_classification/index.html | 2 +- .../vit_for_image_classification/index.html | 2 +- .../sparknlp/annotator/date2_chunk/index.html | 3 +- .../dependency/dependency_parser/index.html | 3 +- .../sparknlp/annotator/dependency/index.html | 3 +- .../typed_dependency_parser/index.html | 3 +- .../annotator/document_normalizer/index.html | 3 +- .../embeddings/albert_embeddings/index.html | 3 +- .../embeddings/bert_embeddings/index.html | 3 +- .../bert_sentence_embeddings/index.html | 3 +- .../camembert_embeddings/index.html | 3 +- .../embeddings/chunk_embeddings/index.html | 3 +- .../embeddings/deberta_embeddings/index.html | 3 +- .../distil_bert_embeddings/index.html | 3 +- .../annotator/embeddings/doc2vec/index.html | 3 +- .../embeddings/e5_embeddings/index.html | 3 +- .../embeddings/elmo_embeddings/index.html | 3 +- .../sparknlp/annotator/embeddings/index.html | 3 +- .../instructor_embeddings/index.html | 3 +- .../longformer_embeddings/index.html | 3 +- .../embeddings/roberta_embeddings/index.html | 3 +- .../roberta_sentence_embeddings/index.html | 3 +- .../embeddings/sentence_embeddings/index.html | 3 +- .../universal_sentence_encoder/index.html | 3 +- .../annotator/embeddings/word2vec/index.html | 3 +- .../embeddings/word_embeddings/index.html | 3 +- .../xlm_roberta_embeddings/index.html | 3 +- .../index.html | 3 +- .../embeddings/xlnet_embeddings/index.html | 3 +- .../annotator/er/entity_ruler/index.html | 3 +- .../sparknlp/annotator/er/index.html | 3 +- .../annotator/graph_extraction/index.html | 3 +- .../autosummary/sparknlp/annotator/index.html | 4 +- 
.../annotator/keyword_extraction/index.html | 3 +- .../yake_keyword_extraction/index.html | 3 +- .../sparknlp/annotator/ld_dl/index.html | 3 +- .../ld_dl/language_detector_dl/index.html | 3 +- .../sparknlp/annotator/lemmatizer/index.html | 3 +- .../matcher/big_text_matcher/index.html | 3 +- .../annotator/matcher/date_matcher/index.html | 3 +- .../sparknlp/annotator/matcher/index.html | 3 +- .../matcher/multi_date_matcher/index.html | 3 +- .../matcher/regex_matcher/index.html | 3 +- .../annotator/matcher/text_matcher/index.html | 3 +- .../annotator/n_gram_generator/index.html | 3 +- .../sparknlp/annotator/ner/index.html | 3 +- .../annotator/ner/ner_approach/index.html | 3 +- .../annotator/ner/ner_converter/index.html | 3 +- .../sparknlp/annotator/ner/ner_crf/index.html | 3 +- .../sparknlp/annotator/ner/ner_dl/index.html | 3 +- .../annotator/ner/ner_overwriter/index.html | 3 +- .../ner/zero_shot_ner_model/index.html | 2 +- .../sparknlp/annotator/normalizer/index.html | 3 +- .../param/classifier_encoder/index.html | 2 +- .../param/evaluation_dl_params/index.html | 2 +- .../sparknlp/annotator/param/index.html | 3 +- .../sparknlp/annotator/pos/index.html | 3 +- .../annotator/pos/perceptron/index.html | 3 +- .../sparknlp/annotator/sentence/index.html | 3 +- .../sentence/sentence_detector/index.html | 3 +- .../sentence/sentence_detector_dl/index.html | 3 +- .../sparknlp/annotator/sentiment/index.html | 3 +- .../sentiment/sentiment_detector/index.html | 3 +- .../sentiment/vivekn_sentiment/index.html | 3 +- .../seq2seq/bart_transformer/index.html | 3 +- .../seq2seq/gpt2_transformer/index.html | 3 +- .../sparknlp/annotator/seq2seq/index.html | 3 +- .../seq2seq/marian_transformer/index.html | 3 +- .../seq2seq/t5_transformer/index.html | 3 +- .../document_similarity_ranker/index.html | 2 +- .../sparknlp/annotator/similarity/index.html | 2 +- .../context_spell_checker/index.html | 3 +- .../sparknlp/annotator/spell_check/index.html | 3 +- 
.../spell_check/norvig_sweeting/index.html | 3 +- .../spell_check/symmetric_delete/index.html | 3 +- .../sparknlp/annotator/stemmer/index.html | 3 +- .../annotator/stop_words_cleaner/index.html | 3 +- .../tf_ner_dl_graph_builder/index.html | 2 +- .../token/chunk_tokenizer/index.html | 3 +- .../sparknlp/annotator/token/index.html | 3 +- .../token/recursive_tokenizer/index.html | 3 +- .../token/regex_tokenizer/index.html | 3 +- .../annotator/token/tokenizer/index.html | 3 +- .../sparknlp/annotator/ws/index.html | 3 +- .../annotator/ws/word_segmenter/index.html | 3 +- .../sparknlp/base/audio_assembler/index.html | 2 +- .../sparknlp/base/doc2_chunk/index.html | 2 +- .../base/document_assembler/index.html | 2 +- .../base/embeddings_finisher/index.html | 2 +- .../sparknlp/base/finisher/index.html | 2 +- .../sparknlp/base/graph_finisher/index.html | 2 +- .../base/has_recursive_fit/index.html | 2 +- .../base/has_recursive_transform/index.html | 2 +- .../sparknlp/base/image_assembler/index.html | 2 +- .../autosummary/sparknlp/base/index.html | 2 +- .../sparknlp/base/light_pipeline/index.html | 2 +- .../base/multi_document_assembler/index.html | 2 +- .../base/recursive_pipeline/index.html | 2 +- .../sparknlp/base/table_assembler/index.html | 2 +- .../sparknlp/base/token2_chunk/index.html | 2 +- .../sparknlp/base/token_assembler/index.html | 2 +- .../common/annotator_approach/index.html | 2 +- .../common/annotator_model/index.html | 2 +- .../common/annotator_properties/index.html | 2 +- .../sparknlp/common/annotator_type/index.html | 2 +- .../common/coverage_result/index.html | 2 +- .../autosummary/sparknlp/common/index.html | 2 +- .../sparknlp/common/match_strategy/index.html | 2 +- .../sparknlp/common/properties/index.html | 2 +- .../sparknlp/common/read_as/index.html | 2 +- .../recursive_annotator_approach/index.html | 2 +- .../sparknlp/common/storage/index.html | 2 +- .../sparknlp/common/utils/index.html | 2 +- .../autosummary/sparknlp/functions/index.html | 2 +- 
.../reference/autosummary/sparknlp/index.html | 2 +- .../internal/annotator_java_ml/index.html | 2 +- .../internal/annotator_transformer/index.html | 2 +- .../internal/extended_java_wrapper/index.html | 2 +- .../autosummary/sparknlp/internal/index.html | 2 +- .../params_getters_setters/index.html | 2 +- .../sparknlp/internal/recursive/index.html | 2 +- .../sparknlp/logging/comet/index.html | 2 +- .../autosummary/sparknlp/logging/index.html | 2 +- .../sparknlp/pretrained/index.html | 2 +- .../pretrained/pretrained_pipeline/index.html | 2 +- .../pretrained/resource_downloader/index.html | 2 +- .../sparknlp/pretrained/utils/index.html | 2 +- .../sparknlp/training/conll/index.html | 2 +- .../sparknlp/training/conllu/index.html | 2 +- .../autosummary/sparknlp/training/index.html | 2 +- .../sparknlp/training/pos/index.html | 2 +- .../sparknlp/training/pub_tator/index.html | 2 +- .../training/spacy_to_annotation/index.html | 2 +- .../sparknlp/training/tfgraphs/index.html | 2 +- .../sparknlp/upload_to_hub/index.html | 2 +- .../autosummary/sparknlp/util/index.html | 2 +- docs/api/python/reference/index.html | 2 +- docs/api/python/search.html | 2 +- docs/api/python/searchindex.js | 2 +- .../python/static/documentation_options.js | 2 +- docs/api/python/third_party/Comet.html | 2 +- docs/api/python/third_party/MLflow.html | 2 +- docs/api/python/third_party/index.html | 2 +- docs/api/python/user_guide/annotation.html | 2 +- docs/api/python/user_guide/annotators.html | 2 +- .../python/user_guide/custom_pipelines.html | 2 +- docs/api/python/user_guide/helpers.html | 2 +- docs/api/python/user_guide/index.html | 2 +- .../python/user_guide/light_pipelines.html | 2 +- .../user_guide/pretrained_pipelines.html | 2 +- docs/api/python/user_guide/training.html | 2 +- .../nlp/embeddings/AlbertEmbeddings.scala | 7 +- 1354 files changed, 15399 insertions(+), 4417 deletions(-) create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/ReadXlmRoBertaForZeroShotDLModel.html 
create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/ReadablePretrainedXlmRoBertaForZeroShotModel.html create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification$.html create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification.html create mode 100644 docs/api/python/modules/sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification.html create mode 100644 docs/api/python/reference/autosummary/sparknlp/annotator/classifier_dl/xlm_roberta_for_zero_shot_classification/index.html diff --git a/docs/api/com/index.html b/docs/api/com/index.html index 0e735e306ba9fe..2aa77b1345b6ec 100644 --- a/docs/api/com/index.html +++ b/docs/api/com/index.html @@ -3,9 +3,9 @@ - Spark NLP 5.0.1 ScalaDoc - com - - + Spark NLP 5.0.2 ScalaDoc - com + + @@ -28,7 +28,7 @@