From dc18ef4aabef0c171384ace46518890d48a52884 Mon Sep 17 00:00:00 2001 From: Devin Ha <33089471+DevinTDHa@users.noreply.github.com> Date: Wed, 27 Dec 2023 15:12:55 +0100 Subject: [PATCH 01/14] SPARKNLP-955: Add missing companion object (#14088) --- .../DocumentCharacterTextSplitter.scala | 7 +++- .../DocumentCharacterTextSplitterTest.scala | 42 ++++++++++++++++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala index ca7f5c15d7705f..27cc0acc8665cd 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala @@ -3,7 +3,7 @@ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.functions.ExplodeAnnotations import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, HasSimpleAnnotate} import org.apache.spark.ml.param.{BooleanParam, IntParam, StringArrayParam} -import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.DataFrame import scala.util.matching.Regex @@ -270,3 +270,8 @@ class DocumentCharacterTextSplitter(override val uid: String) } } + +/** This is the companion object of [[DocumentCharacterTextSplitter]]. Please refer to that class + * for the documentation. 
+ */ +object DocumentCharacterTextSplitter extends DefaultParamsReadable[DocumentCharacterTextSplitter] diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala index e8179829b63e85..b554a83ac046c7 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala @@ -1,9 +1,11 @@ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.Annotation +import com.johnsnowlabs.nlp.annotator.DocumentCharacterTextSplitter import com.johnsnowlabs.nlp.base.DocumentAssembler import com.johnsnowlabs.nlp.util.io.ResourceHelper -import com.johnsnowlabs.tags.FastTest +import com.johnsnowlabs.tags.{FastTest, SlowTest} +import org.apache.spark.ml.Pipeline import org.apache.spark.sql.DataFrame import org.scalatest.flatspec.AnyFlatSpec @@ -221,4 +223,42 @@ class DocumentCharacterTextSplitterTest extends AnyFlatSpec { assertResult(sampleText, result, expected) } + it should "be serializable" taggedAs SlowTest in { + val textSplitter = new DocumentCharacterTextSplitter() + .setInputCols("document") + .setOutputCol("splits") + .setChunkSize(1000) + .setChunkOverlap(100) + .setExplodeSplits(true) + + val pipeline = new Pipeline().setStages(Array(documentAssembler, textSplitter)) + val pipelineModel = pipeline.fit(splitTextDF) + + pipelineModel.stages.last + .asInstanceOf[DocumentCharacterTextSplitter] + .write + .overwrite() + .save("./tmp_textSplitter") + + val loadedTextSplitModel = DocumentCharacterTextSplitter.load("tmp_textSplitter") + + loadedTextSplitModel.transform(textDocument).select("splits").show(truncate = false) + } + + it should "be exportable to pipeline" taggedAs SlowTest in { + val textSplitter = new DocumentCharacterTextSplitter() + .setInputCols("document") + .setOutputCol("splits") + 
.setChunkSize(1000) + .setChunkOverlap(100) + .setExplodeSplits(true) + + val pipeline = new Pipeline().setStages(Array(documentAssembler, textSplitter)) + pipeline.write.overwrite().save("tmp_textsplitter_pipe") + + val loadedPipelineModel = Pipeline.load("tmp_textsplitter_pipe") + + loadedPipelineModel.fit(splitTextDF).transform(splitTextDF).select("splits").show() + } + } From 4723d9c762fe739a49f25a72cb32446464a77c03 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 27 Dec 2023 16:12:57 +0100 Subject: [PATCH 02/14] Fix the missing DefaultParamsReadable in DocumentTokenSplitter [skip test] --- .../DocumentCharacterTextSplitter.scala | 19 ++++++- .../annotators/DocumentTokenSplitter.scala | 24 ++++++++- .../DocumentCharacterTextSplitterTest.scala | 16 +++++- .../DocumentTokenSplitterTest.scala | 54 +++++++++++++++++++ 4 files changed, 108 insertions(+), 5 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala index 27cc0acc8665cd..20580fc9f19be4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter.scala @@ -1,3 +1,18 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.functions.ExplodeAnnotations @@ -272,6 +287,6 @@ class DocumentCharacterTextSplitter(override val uid: String) } /** This is the companion object of [[DocumentCharacterTextSplitter]]. Please refer to that class - * for the documentation. - */ + * for the documentation. + */ object DocumentCharacterTextSplitter extends DefaultParamsReadable[DocumentCharacterTextSplitter] diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitter.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitter.scala index 1acfd42d710bca..6499d584c79182 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitter.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitter.scala @@ -1,12 +1,27 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.johnsnowlabs.nlp.annotators +import com.johnsnowlabs.nlp.functions.ExplodeAnnotations import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, HasSimpleAnnotate} import org.apache.spark.ml.param.{BooleanParam, IntParam} -import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.sql.DataFrame import scala.util.matching.Regex -import com.johnsnowlabs.nlp.functions.ExplodeAnnotations /** Annotator that splits large documents into smaller documents based on the number of tokens in * the text. @@ -223,3 +238,8 @@ class DocumentTokenSplitter(override val uid: String) if (getExplodeSplits) dataset.explodeAnnotationsCol(getOutputCol, getOutputCol) else dataset } } + +/** This is the companion object of [[DocumentTokenSplitter]]. Please refer to that class for the + * documentation. + */ +object DocumentTokenSplitter extends DefaultParamsReadable[DocumentTokenSplitter] diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala index b554a83ac046c7..ca7d1317c6f3f7 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitterTest.scala @@ -1,7 +1,21 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.Annotation -import com.johnsnowlabs.nlp.annotator.DocumentCharacterTextSplitter import com.johnsnowlabs.nlp.base.DocumentAssembler import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.tags.{FastTest, SlowTest} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala index 036205711dc924..92e54bcaf59b49 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitterTest.scala @@ -1,9 +1,25 @@ +/* + * Copyright 2017-2023 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.Annotation import com.johnsnowlabs.nlp.base.DocumentAssembler import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.tags.FastTest +import org.apache.spark.ml.Pipeline import org.apache.spark.sql.DataFrame import org.scalatest.flatspec.AnyFlatSpec @@ -55,4 +71,42 @@ class DocumentTokenSplitterTest extends AnyFlatSpec { } } + it should "be serializable" taggedAs FastTest in { + val numTokens = 3 + val textSplitter = new DocumentTokenSplitter() + .setInputCols("document") + .setOutputCol("splits") + .setNumTokens(numTokens) + .setTokenOverlap(1) + + val pipeline = new Pipeline().setStages(Array(documentAssembler, textSplitter)) + val pipelineModel = pipeline.fit(textDf) + + pipelineModel.stages.last + .asInstanceOf[DocumentTokenSplitter] + .write + .overwrite() + .save("./tmp_textSplitter") + + val loadedTextSplitModel = DocumentTokenSplitter.load("tmp_textSplitter") + + loadedTextSplitModel.transform(textDocument).select("splits").show(truncate = false) + } + + it should "be exportable to pipeline" taggedAs FastTest in { + val numTokens = 3 + val textSplitter = new DocumentTokenSplitter() + .setInputCols("document") + .setOutputCol("splits") + .setNumTokens(numTokens) + .setTokenOverlap(1) + + val pipeline = new Pipeline().setStages(Array(documentAssembler, textSplitter)) + pipeline.write.overwrite().save("tmp_textsplitter_pipe") + + val loadedPipelineModel = Pipeline.load("tmp_textsplitter_pipe") + + loadedPipelineModel.fit(textDf).transform(textDf).select("splits").show() + } + } From 3c4c1e139506df0e36f2313bac574a8c3ee9cc2e Mon Sep 17 00:00:00 2001 From: Devin Ha <33089471+DevinTDHa@users.noreply.github.com> Date: Wed, 27 Dec 2023 16:37:16 +0100 Subject: [PATCH 03/14] SPARKNLP-951 & SPARKNLP-952: Added example notebooks for Marian and T5 (#14089) * SPARKNLP-951: Added example notebooks for Marian and T5 * Added Missing notebooks in transformers table --- 
docs/en/transformers.md | 176 +- .../HuggingFace in Spark NLP - T5.ipynb | 2690 +++++++++++++++++ ...HuggingFace_ONNX_in_Spark_NLP_Marian.ipynb | 617 ++++ .../HuggingFace_ONNX_in_Spark_NLP_T5.ipynb | 902 ++++++ 4 files changed, 4303 insertions(+), 82 deletions(-) create mode 100644 examples/python/transformers/HuggingFace in Spark NLP - T5.ipynb create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Marian.ipynb create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb diff --git a/docs/en/transformers.md b/docs/en/transformers.md index 500070d3f6eef4..aabf942f35ef53 100644 --- a/docs/en/transformers.md +++ b/docs/en/transformers.md @@ -47,69 +47,70 @@ We have extended support for `HuggingFace` 🤗 and `TF Hub` exported models s - Under development ❎ - Not supported ❌ -| Spark NLP | TF Hub | HuggingFace | ONNX | Model Architecture | -|:----------------------------------------------|:-------|:-------------|:-----|:-----------------------------------------------------------------------------------------------------------------------------------------------------------| -| AlbertEmbeddings | ✅ | ✅ | ✅ | ALBERT | -| AlbertForQuestionAnswering | | ✅ | ❎ | [TFAlbertForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/albert#transformers.TFAlbertForQuestionAnswering) | -| AlbertForSequenceClassification | | ✅ | ❎ | [TFAlbertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/albert#transformers.TFAlbertForSequenceClassification) | -| AlbertForTokenClassification | | ✅ | ❎ | [TFAlbertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/albert#transformers.TFAlbertForTokenClassification) | -| Automatic Speech Recognition (Wav2Vec2ForCTC) | | ❎ | ❎ | [TFWav2Vec2ForCTC](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.TFWav2Vec2ForCTC) | -| BartForZeroShotClassification | | ✅ | ❎ | 
[TFBartForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/bart#transformers.TFBartForSequenceClassification) | -| BartTransformer | | ✅ | ❎ | [TFBartForConditionalGeneration](https://huggingface.co/docs/transformers/model_doc/bart#transformers.TFBartForConditionalGeneration) | -| BertEmbeddings | ✅ | ✅ | ✅ | BERT - Small BERT - ELECTRA | -| BertForQuestionAnswering | | ✅ | ✅ | [TFBertForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForQuestionAnswering) | -| BertForSequenceClassification | | ✅ | ✅ | [TFBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForSequenceClassification) | -| BertForTokenClassification | | ✅ | ✅ | [TFBertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForTokenClassification) | -| BertForZeroShotClassification | | ✅ | ❎ | [TFBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForSequenceClassification) | -| BertSentenceEmbeddings | ✅ | ✅ | ❎ | BERT - Small BERT - ELECTRA | -| CamemBertEmbeddings | | ✅ | ✅ | CamemBERT | -| CamemBertForQuestionAnswering | | ✅ | ❎ | [TFCamembertForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.TFCamembertForQuestionAnswering) | -| CamemBertForSequenceClassification | | ✅ | ❎ | [TFCamemBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.TFCamembertForSequenceClassification) | -| CamemBertForTokenClassification | | ✅ | ❎ | [TFCamemBertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.TFCamembertForTokenClassification) | -| ConvNextForImageClassification | | ❎ | ❎ | [TFConvNextForImageClassification](https://huggingface.co/docs/transformers/model_doc/convnext#transformers.TFConvNextForImageClassification) | -| DeBertaEmbeddings | | ✅ | ✅ | 
DeBERTa-v2 - DeBERTa-v3 | -| DeBertaForQuestionAnswering | | ✅ | ❎ | [TFDebertaV2ForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/deberta-v2#transformers.TFDebertaV2ForQuestionAnswering) | -| DeBertaForSequenceClassification | | ✅ | ❎ | [TFDebertaV2ForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/deberta-v2#transformers.TFDebertaV2ForSequenceClassification) | -| DeBertaForTokenClassification | | ✅ | ❎ | [TFDebertaV2ForTokenClassification](https://huggingface.co/docs/transformers/model_doc/deberta-v2#transformers.TFDebertaV2ForTokenClassification) | -| DistilBertEmbeddings | | ✅ | ✅ | DistilBERT | -| DistilBertForQuestionAnswering | | ✅ | ✅ | [TFDistilBertForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.TFDistilBertForQuestionAnswering) | -| DistilBertForSequenceClassification | | ✅ | ✅ | [TFDistilBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.TFDistilBertForSequenceClassification) | -| DistilBertForTokenClassification | | ✅ | ✅ | [TFDistilBertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.TFDistilBertForTokenClassification) | -| DistilBertForZeroShotClassification | | ✅ | ❎ | [TFDistilBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.TFDistilBertForSequenceClassification) | -| E5Embeddings | | ✅ | ✅ | [SentenceTransformer](https://github.com/UKPLab/sentence-transformers) | -| ElmoEmbeddings | ❎ | | ❌ | | -| HubertForCTC | | ❎ | ❎ | [TFHubertForCTC](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.TFHubertForCTC) | -| InstructorEmbeddings | | ✅ | ❎ | INSTRUCTOR | -| LongformerEmbeddings | | ✅ | ❌ | Longformer | -| LongformerForQuestionAnswering | | ✅ | ❎ | 
[TFLongformerForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/longformer#transformers.TFLongformerForQuestionAnswering) | -| LongformerForSequenceClassification | | ✅ | ❎ | [TFLongformerForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/longformer#transformers.TFLongformerForSequenceClassification) | -| LongformerForTokenClassification | | ✅ | ❎ | [TFLongformerForTokenClassification](https://huggingface.co/docs/transformers/model_doc/longformer#transformers.TFLongformerForTokenClassification) | -| MarianTransformer | | ❌ | ❎ | | -| MPNetEmbeddings | | ✅ | ✅ | [SentenceTransformer](https://github.com/UKPLab/sentence-transformers) | -| OpenAI GPT2 | | ❌ | ❎ | | -| RoBertaEmbeddings | | ✅ | ✅ | RoBERTa - DistilRoBERTa | -| RoBertaForQuestionAnswering | | ✅ | ❎ | [TFRobertaForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForQuestionAnswering) | -| RoBertaForSequenceClassification | | ✅ | ❎ | [TFRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForSequenceClassification) | -| RoBertaForTokenClassification | | ✅ | ❎ | [TFRobertaForTokenClassification](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForTokenClassification) | -| RoBertaForZeroShotClassification | | ✅ | ❎ | [TFRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForSequenceClassification) | -| RoBertaSentenceEmbeddings | | ✅ | ✅ | RoBERTa - DistilRoBERTa | -| SwinForImageClassification | | ❎ | ❎ | [TFSwinForImageClassification](https://huggingface.co/docs/transformers/model_doc/swin#transformers.TFSwinForImageClassification) | -| T5Transformer | | ❌ | ❎ | | -| TapasForQuestionAnswering | | ❎ | ❎ | [TFTapasForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/tapas#transformers.TFTapasForQuestionAnswering) | -| 
UniversalSentenceEncoder | ❎ | | ❌ | | -| VisionEncoderDecoderForImageCaptioning | | ✅ | ❎ | [VisionEncoderDecoderModel](https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder#vision-encoder-decoder-models) | -| ViTForImageClassification | ❌ | ✅ | ❎ | [TFViTForImageClassification](https://huggingface.co/docs/transformers/model_doc/vit#transformers.TFViTForImageClassification) | -| WhisperForCTC | | ✅ | ✅ | [WhisperForConditionalGeneration](https://huggingface.co/docs/transformers/v4.33.2/en/model_doc/whisper#transformers.WhisperForConditionalGeneration) | -| XlmRoBertaEmbeddings | | ✅ | ✅ | XLM-RoBERTa | -| XlmRoBertaForQuestionAnswering | | ✅ | ❎ | [TFXLMRobertaForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForQuestionAnswering) | -| XlmRoBertaForSequenceClassification | | ✅ | ❎ | [TFXLMRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForSequenceClassification) | -| XlmRoBertaForTokenClassification | | ✅ | ❎ | [TFXLMRobertaForTokenClassification](https://huggingface.co/docs/transformers/model_doc/xlmroberta#transformers.TFXLMRobertaForTokenClassification) | -| XlmRoBertaForZeroShotClassification | | ✅ | ❎ | [TFXLMRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForSequenceClassification) | -| XlmRoBertaSentenceEmbeddings | | ✅ | ❎ | [SentenceTransformer](https://github.com/UKPLab/sentence-transformers) | -| XlnetEmbeddings | | ✅ | ❌ | XLNet | -| XlnetForSequenceClassification | | ✅ | ❎ | [TFXLNetForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/xlnet#transformers.TFXLNetForSequenceClassification) | -| XlnetForTokenClassification | | ✅ | ❎ | [TFXLNetForTokenClassificationet](https://huggingface.co/docs/transformers/model_doc/xlnet#transformers.TFXLNetForTokenClassificationet) | -| ZeroShotNerModel | | ✅ | ❎ 
| [TFRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForSequenceClassification) | +| Spark NLP | TF Hub | HuggingFace | ONNX | Model Architecture | +| :-------------------------------------------- | :----- | :---------- | :--- | :--------------------------------------------------------------------------------------------------------------------------------------------------------- | +| AlbertEmbeddings | ✅ | ✅ | ✅ | ALBERT | +| AlbertForQuestionAnswering | | ✅ | ❎ | [TFAlbertForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/albert#transformers.TFAlbertForQuestionAnswering) | +| AlbertForSequenceClassification | | ✅ | ❎ | [TFAlbertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/albert#transformers.TFAlbertForSequenceClassification) | +| AlbertForTokenClassification | | ✅ | ❎ | [TFAlbertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/albert#transformers.TFAlbertForTokenClassification) | +| Automatic Speech Recognition (Wav2Vec2ForCTC) | | ❎ | ❎ | [TFWav2Vec2ForCTC](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.TFWav2Vec2ForCTC) | +| BartForZeroShotClassification | | ✅ | ❎ | [TFBartForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/bart#transformers.TFBartForSequenceClassification) | +| BartTransformer | | ✅ | ❎ | [TFBartForConditionalGeneration](https://huggingface.co/docs/transformers/model_doc/bart#transformers.TFBartForConditionalGeneration) | +| BertEmbeddings | ✅ | ✅ | ✅ | BERT - Small BERT - ELECTRA | +| BertForQuestionAnswering | | ✅ | ✅ | [TFBertForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForQuestionAnswering) | +| BertForSequenceClassification | | ✅ | ✅ | [TFBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForSequenceClassification) | +| 
BertForTokenClassification | | ✅ | ✅ | [TFBertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForTokenClassification) | +| BertForZeroShotClassification | | ✅ | ❎ | [TFBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForSequenceClassification) | +| BertSentenceEmbeddings | ✅ | ✅ | ❎ | BERT - Small BERT - ELECTRA | +| CamemBertEmbeddings | | ✅ | ✅ | CamemBERT | +| CamemBertForQuestionAnswering | | ✅ | ❎ | [TFCamembertForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.TFCamembertForQuestionAnswering) | +| CamemBertForSequenceClassification | | ✅ | ❎ | [TFCamemBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.TFCamembertForSequenceClassification) | +| CamemBertForTokenClassification | | ✅ | ❎ | [TFCamemBertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.TFCamembertForTokenClassification) | +| CLIPForZeroShotClassification | | ✅ | ✅ | [CLIP](https://huggingface.co/docs/transformers/v4.36.1/model_doc/clip) | +| ConvNextForImageClassification | | ❎ | ❎ | [TFConvNextForImageClassification](https://huggingface.co/docs/transformers/model_doc/convnext#transformers.TFConvNextForImageClassification) | +| DeBertaEmbeddings | | ✅ | ✅ | DeBERTa-v2 - DeBERTa-v3 | +| DeBertaForQuestionAnswering | | ✅ | ❎ | [TFDebertaV2ForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/deberta-v2#transformers.TFDebertaV2ForQuestionAnswering) | +| DeBertaForSequenceClassification | | ✅ | ❎ | [TFDebertaV2ForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/deberta-v2#transformers.TFDebertaV2ForSequenceClassification) | +| DeBertaForTokenClassification | | ✅ | ❎ | 
[TFDebertaV2ForTokenClassification](https://huggingface.co/docs/transformers/model_doc/deberta-v2#transformers.TFDebertaV2ForTokenClassification) | +| DistilBertEmbeddings | | ✅ | ✅ | DistilBERT | +| DistilBertForQuestionAnswering | | ✅ | ✅ | [TFDistilBertForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.TFDistilBertForQuestionAnswering) | +| DistilBertForSequenceClassification | | ✅ | ✅ | [TFDistilBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.TFDistilBertForSequenceClassification) | +| DistilBertForTokenClassification | | ✅ | ✅ | [TFDistilBertForTokenClassification](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.TFDistilBertForTokenClassification) | +| DistilBertForZeroShotClassification | | ✅ | ❎ | [TFDistilBertForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.TFDistilBertForSequenceClassification) | +| E5Embeddings | | ✅ | ✅ | [SentenceTransformer](https://github.com/UKPLab/sentence-transformers) | +| ElmoEmbeddings | ❎ | | ❌ | | +| HubertForCTC | | ❎ | ❎ | [TFHubertForCTC](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.TFHubertForCTC) | +| InstructorEmbeddings | | ✅ | ❎ | INSTRUCTOR | +| LongformerEmbeddings | | ✅ | ❌ | Longformer | +| LongformerForQuestionAnswering | | ✅ | ❎ | [TFLongformerForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/longformer#transformers.TFLongformerForQuestionAnswering) | +| LongformerForSequenceClassification | | ✅ | ❎ | [TFLongformerForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/longformer#transformers.TFLongformerForSequenceClassification) | +| LongformerForTokenClassification | | ✅ | ❎ | [TFLongformerForTokenClassification](https://huggingface.co/docs/transformers/model_doc/longformer#transformers.TFLongformerForTokenClassification) | +| MarianTransformer | | ❌ | 
❎ | | +| MPNetEmbeddings | | ✅ | ✅ | [SentenceTransformer](https://github.com/UKPLab/sentence-transformers) | +| OpenAI GPT2 | | ❌ | ❎ | | +| RoBertaEmbeddings | | ✅ | ✅ | RoBERTa - DistilRoBERTa | +| RoBertaForQuestionAnswering | | ✅ | ❎ | [TFRobertaForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForQuestionAnswering) | +| RoBertaForSequenceClassification | | ✅ | ❎ | [TFRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForSequenceClassification) | +| RoBertaForTokenClassification | | ✅ | ❎ | [TFRobertaForTokenClassification](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForTokenClassification) | +| RoBertaForZeroShotClassification | | ✅ | ❎ | [TFRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForSequenceClassification) | +| RoBertaSentenceEmbeddings | | ✅ | ✅ | RoBERTa - DistilRoBERTa | +| SwinForImageClassification | | ❎ | ❎ | [TFSwinForImageClassification](https://huggingface.co/docs/transformers/model_doc/swin#transformers.TFSwinForImageClassification) | +| T5Transformer | | ❌ | ❎ | | +| TapasForQuestionAnswering | | ❎ | ❎ | [TFTapasForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/tapas#transformers.TFTapasForQuestionAnswering) | +| UniversalSentenceEncoder | ❎ | | ❌ | | +| VisionEncoderDecoderForImageCaptioning | | ✅ | ❎ | [VisionEncoderDecoderModel](https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder#vision-encoder-decoder-models) | +| ViTForImageClassification | ❌ | ✅ | ❎ | [TFViTForImageClassification](https://huggingface.co/docs/transformers/model_doc/vit#transformers.TFViTForImageClassification) | +| WhisperForCTC | | ✅ | ✅ | [WhisperForConditionalGeneration](https://huggingface.co/docs/transformers/v4.33.2/en/model_doc/whisper#transformers.WhisperForConditionalGeneration) | +| 
XlmRoBertaEmbeddings | | ✅ | ✅ | XLM-RoBERTa | +| XlmRoBertaForQuestionAnswering | | ✅ | ❎ | [TFXLMRobertaForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForQuestionAnswering) | +| XlmRoBertaForSequenceClassification | | ✅ | ❎ | [TFXLMRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForSequenceClassification) | +| XlmRoBertaForTokenClassification | | ✅ | ❎ | [TFXLMRobertaForTokenClassification](https://huggingface.co/docs/transformers/model_doc/xlmroberta#transformers.TFXLMRobertaForTokenClassification) | +| XlmRoBertaForZeroShotClassification | | ✅ | ❎ | [TFXLMRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForSequenceClassification) | +| XlmRoBertaSentenceEmbeddings | | ✅ | ❎ | [SentenceTransformer](https://github.com/UKPLab/sentence-transformers) | +| XlnetEmbeddings | | ✅ | ❌ | XLNet | +| XlnetForSequenceClassification | | ✅ | ❎ | [TFXLNetForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/xlnet#transformers.TFXLNetForSequenceClassification) | +| XlnetForTokenClassification | | ✅ | ❎ | [TFXLNetForTokenClassificationet](https://huggingface.co/docs/transformers/model_doc/xlnet#transformers.TFXLNetForTokenClassificationet) | +| ZeroShotNerModel | | ✅ | ❎ | [TFRobertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.TFRobertaForSequenceClassification) |
@@ -117,31 +118,39 @@ We have extended support for `HuggingFace` 🤗 and `TF Hub` exported models s #### HuggingFace, Optimum, PyTorch, and ONNX Runtime to Spark NLP (ONNX) -| Spark NLP | Notebooks | Colab | -|:------------------------------- |:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| AlbertForQuestionAnswering | [HuggingFace ONNX in Spark NLP AlbertForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForQuestionAnswering.ipynb) | -| AlbertForSequenceClassification | [HuggingFace ONNX in Spark NLP AlbertForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForSequenceClassification.ipynb) | -| AlbertForTokenClassification | [HuggingFace ONNX in Spark NLP 
AlbertForTokenClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForTokenClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForTokenClassification.ipynb) | -| BertForQuestionAnswering | [HuggingFace ONNX in Spark NLP BertForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForQuestionAnswering.ipynb) | -| BertForSequenceClassification | [HuggingFace ONNX in Spark NLP BertForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForSequenceClassification.ipynb) | -| BertForTokenClassification | [HuggingFace ONNX in Spark NLP BertForTokenClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForTokenClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForTokenClassification.ipynb) | -| 
DistilBertForQuestionAnswering | [HuggingFace ONNX in Spark NLP DistilBertForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBertForQuestionAnswering.ipynb) | +| Spark NLP | Notebooks | Colab | +| :---------------------------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| AlbertForQuestionAnswering | [HuggingFace ONNX in Spark NLP AlbertForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForQuestionAnswering.ipynb) | +| AlbertForSequenceClassification | [HuggingFace ONNX in Spark NLP AlbertForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForSequenceClassification.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForSequenceClassification.ipynb) | +| AlbertForTokenClassification | [HuggingFace ONNX in Spark NLP AlbertForTokenClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForTokenClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForTokenClassification.ipynb) | +| BertEmbeddings | [HuggingFace ONNX in Spark NLP BERT](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb) | +| BertForQuestionAnswering | [HuggingFace ONNX in Spark NLP BertForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForQuestionAnswering.ipynb) | +| BertForSequenceClassification | [HuggingFace ONNX in Spark NLP BertForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForSequenceClassification.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForSequenceClassification.ipynb) | +| BertForTokenClassification | [HuggingFace ONNX in Spark NLP BertForTokenClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForTokenClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertForTokenClassification.ipynb) | +| BertSentenceEmbeddings | [HuggingFace ONNX in Spark NLP BertSentenceEmbeddings](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertSentenceEmbeddings.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BertSentenceEmbeddings.ipynb) | +| CLIPForZeroShotClassification | [HuggingFace ONNX in Spark NLP CLIP](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CLIP.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CLIP.ipynb) | +| DeBertaEmbeddings | [HuggingFace ONNX in Spark NLP DeBERTa](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb) | +| DistilBertEmbeddings | [HuggingFace ONNX in Spark NLP DistilBERT](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb) | +| DistilBertForQuestionAnswering | [HuggingFace ONNX in Spark NLP DistilBertForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBertForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBertForQuestionAnswering.ipynb) | | DistilBertForSequenceClassification | [HuggingFace ONNX in Spark NLP DistilBertForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBertForSequenceClassification.ipynb) | | DistilBertForTokenClassification | [HuggingFace ONNX in Spark NLP DistilBertForTokenClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBertForTokenClassification.ipynb) | 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBertForTokenClassification.ipynb) | -| BertEmbeddings | [HuggingFace ONNX in Spark NLP BERT](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_BERT.ipynb) | -| DeBertaEmbeddings | [HuggingFace ONNX in Spark NLP DeBERTa](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DeBERTa.ipynb) | -| DistilBertEmbeddings | [HuggingFace ONNX in Spark NLP DistilBERT](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_DistilBERT.ipynb) | -| E5Embeddings | [HuggingFace ONNX in Spark NLP E5](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb) | -| 
MPNet | [HuggingFace ONNX in Spark NLP MPNet](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb) | -| RoBertaEmbeddings | [HuggingFace ONNX in Spark NLP RoBERTa](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBERTa.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBERTa.ipynb) | -| WhisperForCTC | [HuggingFace ONNX in Spark NLP MPNet](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb) | +| E5Embeddings | [HuggingFace ONNX in Spark NLP E5](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb) | +| MarianTransformer | [HuggingFace ONNX in Spark NLP Marian](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Marian.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Marian.ipynb) | +| MPNet | [HuggingFace ONNX in Spark NLP MPNet](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb) | +| RoBertaEmbeddings | [HuggingFace ONNX in Spark NLP RoBERTa](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBERTa.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBERTa.ipynb) | +| RobertaForQuestionAnswering | [HuggingFace ONNX in Spark NLP RoBertaForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForQuestionAnswering.ipynb) | +| RoBertaForSequenceClassification | [HuggingFace ONNX in Spark NLP RoBertaForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForSequenceClassification.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForSequenceClassification.ipynb) | +| RoBertaForTokenClassification | [HuggingFace ONNX in Spark NLP RoBertaForTokenClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForTokenClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_RoBertaForTokenClassification.ipynb) | +| T5Transformer | [HuggingFace ONNX in Spark NLP T5](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb) | +| WhisperForCTC | [HuggingFace ONNX in Spark NLP Whisper](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Whisper.ipynb) | +| XlmRoBertaSentenceEmbeddings | [HuggingFace ONNX in Spark NLP XlmRoBertaSentenceEmbeddings](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaSentenceEmbeddings.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaSentenceEmbeddings.ipynb) |
#### HuggingFace to Spark NLP (TensorFlow) | Spark NLP | Notebooks | Colab | -|:------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| :---------------------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | AlbertEmbeddings | [HuggingFace in Spark NLP - ALBERT](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20ALBERT.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20ALBERT.ipynb) | | AlbertForQuestionAnswering | [HuggingFace in Spark NLP - AlbertForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20AlbertForQuestionAnswering.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20AlbertForQuestionAnswering.ipynb) | | AlbertForSequenceClassification | [HuggingFace in Spark NLP - AlbertForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20AlbertForSequenceClassification.ipynb) | @@ -152,6 +161,7 @@ We have extended support for `HuggingFace` 🤗 and `TF Hub` exported models s | BertForTokenClassification | [HuggingFace in Spark NLP - BertForTokenClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForTokenClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForTokenClassification.ipynb) | | BertForZeroShotClassification | [HuggingFace in Spark NLP - BertForZeroShotClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForZeroShotClassification.ipynb) | | BertSentenceEmbeddings | [HuggingFace in Spark NLP - BERT 
Sentence](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb) | +| BERTSentenceEmbeddings | [HuggingFace in Spark NLP - BERT Sentence](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BERT%20Sentence.ipynb) | | CamemBertEmbeddings | [HuggingFace in Spark NLP - CamemBERT](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBERT.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBERT.ipynb) | | CamemBertForQuestionAnswering | [HuggingFace in Spark NLP - CamemBertForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForQuestionAnswering.ipynb) | | CamemBertForSequenceClassification | [HuggingFace in Spark NLP - 
CamemBertForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20CamemBertForSequenceClassification.ipynb) | @@ -163,7 +173,8 @@ We have extended support for `HuggingFace` 🤗 and `TF Hub` exported models s | DistilBertForQuestionAnswering | [HuggingFace in Spark NLP - DistilBertForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForQuestionAnswering.ipynb) | | DistilBertForSequenceClassification | [HuggingFace in Spark NLP - DistilBertForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForSequenceClassification.ipynb) | | DistilBertForTokenClassification | [HuggingFace in Spark NLP - DistilBertForTokenClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForTokenClassification.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForTokenClassification.ipynb) | -| DistilBertForZeroShotClassification | [HuggingFace in Spark NLP - DistilBertForZeroShotClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForZeroClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForZeroShotClassification.ipynb) | +| DistilBertForZeroClassification | [HuggingFace in Spark NLP - DistilBertForZeroClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForZeroClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForZeroClassification.ipynb) | +| DistilBertForZeroShotClassification | [HuggingFace in Spark NLP - DistilBertForZeroShotClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForZeroShotClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20DistilBertForZeroShotClassification.ipynb) | | LongformerEmbeddings | [HuggingFace in Spark NLP - 
Longformer](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20Longformer.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20Longformer.ipynb) | | LongformerForQuestionAnswering | [HuggingFace in Spark NLP - LongformerForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20LongformerForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20LongformerForQuestionAnswering.ipynb) | | LongformerForSequenceClassification | [HuggingFace in Spark NLP - LongformerForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20LongformerForSequenceClassification.ipynb) | @@ -174,13 +185,14 @@ We have extended support for `HuggingFace` 🤗 and `TF Hub` exported models s | RoBertaForZeroShotClassification | [HuggingFace in Spark NLP - RoBertaForZeroShotClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBertaForZeroShotClassification.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBertaForZeroShotClassification.ipynb) | | SwinForImageClassification | [HuggingFace in Spark NLP - SwinForImageClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20SwinForImageClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20SwinForImageClassification.ipynb) | | ViTForImageClassification | [HuggingFace in Spark NLP - ViTForImageClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20ViTForImageClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20ViTForImageClassification.ipynb) | -| WhisperForCTC | [HuggingFace in Spark NLP - WhisperForCTC](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20RoBertaForZeroShotClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20WhisperForCTC.ipynb) | +| WhisperForCTC | [HuggingFace in Spark NLP - WhisperForCTC](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20WhisperForCTC.ipynb) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20WhisperForCTC.ipynb) | | XlmRoBertaEmbeddings | [HuggingFace in Spark NLP - XLM-RoBERTa](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XLM-RoBERTa.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XLM-RoBERTa.ipynb) | | XlmRobertaForQuestionAnswering | [HuggingFace in Spark NLP - XlmRobertaForQuestionAnswering](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRobertaForQuestionAnswering.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRobertaForQuestionAnswering.ipynb) | | XlmRoBertaForSequenceClassification | [HuggingFace in Spark NLP - XlmRoBertaForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRoBertaForSequenceClassification.ipynb) | | XlmRoBertaForTokenClassification | [HuggingFace in Spark NLP - 
XlmRoBertaForTokenClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRoBertaForTokenClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlmRoBertaForTokenClassification.ipynb) | | XlnetEmbeddings | [HuggingFace in Spark NLP - XLNet](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XLNet.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XLNet.ipynb) | | XlnetForSequenceClassification | [HuggingFace in Spark NLP - XlnetForSequenceClassification](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20BertForSequenceClassification.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20XlnetForSequenceClassification.ipynb) | +| T5Transformer | [HuggingFace in Spark NLP - T5](https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20T5.ipynb) |
diff --git a/examples/python/transformers/HuggingFace in Spark NLP - T5.ipynb b/examples/python/transformers/HuggingFace in Spark NLP - T5.ipynb new file mode 100644 index 00000000000000..e4282b777c3ea0 --- /dev/null +++ b/examples/python/transformers/HuggingFace in Spark NLP - T5.ipynb @@ -0,0 +1,2690 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/HuggingFace%20in%20Spark%20NLP%20-%20T5.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import T5 models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- You can import T5 models via `T5Model`. These models are usually under `Text2Text Generation` category and have `T5` in their labels\n", + "- This is a very computationally expensive module especially on larger sequence. The use of an accelerator such as GPU is recommended.\n", + "- Reference: [T5Model](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Model)\n", + "- Some [example models](https://huggingface.co/models?other=T5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model\n", + "\n", + "- Let's install `transformers` package and it's dependencies.\n", + "- We lock `tensorflow` to version `2.8`\n", + "- We lock `transformers` on version `4.35.2`. This doesn't mean it won't work with the future releases\n", + "- We will also need `sentencepiece` for tokenization." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m497.6/497.6 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.6/42.6 kB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.8/5.8 MB\u001b[0m \u001b[31m86.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m462.5/462.5 kB\u001b[0m \u001b[31m40.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m63.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m86.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m56.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers==4.35.2 sentencepiece tensorflow==2.8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. 
We will use that to save it as TF `SavedModel`.\n", + "- We'll use [google/flan-t5-base](https://huggingface.co/google/flan-t5-base) model from HuggingFace as an example\n", + "- In addition to `T5Model` we also need to save the tokenizer. This is the same for every model, these are assets needed for tokenization inside Spark NLP.\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import transformers\n", + "# Model name, either HF (e.g. \"google/flan-t5-base\") or a local path\n", + "MODEL_NAME = \"google/flan-t5-base\"\n", + "\n", + "# Path to store the exported models\n", + "EXPORT_PATH = f\"exported/{MODEL_NAME}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Exporting this model involves several steps. We need to\n", + "\n", + "1. separate the encoder and decoder and their cache tensors\n", + "2. create a wrapper to create the right model signatures\n", + "3. export the preprocessor to the `assets` folder\n", + "\n", + "Don't worry if this next step seems overwhelming. Once you run the next cell everything should be exported to the right place!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2acf1f98563443e9a84b5cfd51c54ace", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/1.40k [00:00. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + }, + { + "data": { + "text/plain": [ + "('exported/google/flan-t5-base/assets/tokenizer_config.json',\n", + " 'exported/google/flan-t5-base/assets/special_tokens_map.json',\n", + " 'exported/google/flan-t5-base/assets/spiece.model',\n", + " 'exported/google/flan-t5-base/assets/added_tokens.json')" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import T5Tokenizer\n", + "\n", + "# Create assets\n", + "!mkdir -p {EXPORT_PATH}/assets\n", + "\n", + "tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)\n", + "tokenizer.save_pretrained(f\"{EXPORT_PATH}/assets/\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 20836\n", + "drwxr-xr-x 2 root root 4096 Dec 9 16:58 assets\n", + "-rw-r--r-- 1 root root 21326986 Dec 9 16:56 saved_model.pb\n", + "drwxr-xr-x 2 root root 4096 Dec 9 16:56 variables\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 808\n", + "-rw-r--r-- 1 root root 2593 Dec 9 16:58 added_tokens.json\n", + "-rw-r--r-- 1 root root 2543 Dec 9 16:58 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 791656 Dec 9 16:58 spiece.model\n", + "-rw-r--r-- 1 root root 20789 Dec 9 16:58 tokenizer_config.json\n" + ] + } + ], + "source": 
[ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save T5 in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.2.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.2.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m548.5/548.5 kB\u001b[0m \u001b[31m31.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m18.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `T5Transformer` which allows us to load the model\n", + "- Most params will be set automatically. 
They can also be set later after loading the model in `T5Transformer` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "T5 = T5Transformer.loadSavedModel(EXPORT_PATH, spark)\\\n", + " .setUseCache(True) \\\n", + " .setTask(\"summarize:\") \\\n", + " .setMaxOutputLength(200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "T5.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your T5 model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "total 988436\n", + "drwxr-xr-x 3 root root 4096 Dec 9 17:06 fields\n", + "drwxr-xr-x 2 root root 4096 Dec 9 17:06 metadata\n", + "-rw-r--r-- 1 root root 791656 Dec 9 17:08 t5_spp\n", + "-rw-r--r-- 1 root root 1011349768 Dec 9 17:08 t5_tensorflow\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny T5 model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------------------------------------------------------------------------------------------------+\n", + "|result |\n", + "+-----------------------------------------------------------------------------------------------------------+\n", + "|[We introduce a unified framework that converts text-to-text language problems into a text-to-text format.]|\n", + "+-----------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "test_data = spark.createDataFrame([\n", + " [\"Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a \" +\n", + " \"downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness\" +\n", + " \" of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this \" +\n", + " \"paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework \" +\n", + " \"that converts all text-based language problems into a text-to-text format. 
Our systematic study compares \" +\n", + " \"pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens \" +\n", + " \"of language understanding tasks. By combining the insights from our exploration with scale and our new \" +\n", + " \"Colossal Clean Crawled Corpus, we achieve state-of-the-art results on many benchmarks covering \" +\n", + " \"summarization, question answering, text classification, and more. To facilitate future work on transfer \" +\n", + " \"learning for NLP, we release our data set, pre-trained models, and code.\"]\n", + "]).toDF(\"text\")\n", + "\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "T5 = T5Transformer.load(f\"{MODEL_NAME}_spark_nlp\") \\\n", + " .setInputCols([\"document\"]) \\\n", + " .setOutputCol(\"summary\")\n", + "\n", + "pipeline = Pipeline().setStages([document_assembler, T5])\n", + "\n", + "result = pipeline.fit(test_data).transform(test_data)\n", + "result.select(\"summary.result\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! 
You can now go wild and use hundreds of T5 models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "062085846b394124b9f0d51a9a8b0ddc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f8c5a31dabd74077a3b19550b4753f7d", + "max": 990345061, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9635a709448841baa1215db963e22451", + "value": 990345061 + } + }, + "06c26a4680d54622a7f3fd152bb1bf55": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": 
null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "16575639f71240c0999926a05131bab0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "1757784b06544cb2bb2313b1db62e96a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": 
null, + "width": null + } + }, + "20d2c54b571f446990ea5a82b33fb860": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_df0f50d605be4f1a85452fbb835b6b05", + "max": 2201, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_43c4fc857b844ac7bad7b17a98509545", + "value": 2201 + } + }, + "23e43e84ea504257a941eb53104ceeb4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + 
} + }, + "23e4bd5df3c4443694c817207713479d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e5b2155532714fcaa3c6cb7634124bbb", + "placeholder": "​", + "style": "IPY_MODEL_b5339d1d95cd444ea969bae0b6ebe8fd", + "value": "spiece.model: 100%" + } + }, + "2acf1f98563443e9a84b5cfd51c54ace": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e880389770c249a9b9646c6bc444c2a6", + "IPY_MODEL_a86fa2b93bd149c586e0546160e48dd6", + "IPY_MODEL_63ceba7e338d4b91b8cbcca6a9d2e7a5" + ], + "layout": "IPY_MODEL_32d3d4630c0b47c79232156c292f2637" + } + }, + "2b2eab81482f4a49b9a33a10252f8409": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_23e4bd5df3c4443694c817207713479d", + "IPY_MODEL_3fe5947afa004d4a8e6a8e2f225c8c77", + "IPY_MODEL_74e2eef5069b45ff9363085b69996940" + ], + "layout": 
"IPY_MODEL_f7c588d3aff541c9b58840b4df25aec7" + } + }, + "32d3d4630c0b47c79232156c292f2637": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "38dd6765a7384afe883178c8c39238ad": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ed059ea77fd34669a03efbfa4b4f1ed4", + "max": 2424064, + "min": 0, + "orientation": "horizontal", + "style": 
"IPY_MODEL_464170acb1744305a2b2eb8d7d2982a2", + "value": 2424064 + } + }, + "399ea14bf4504ff59f40d5b8fd2fe1b4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dfa1895ed1b245fd80b9a806c9e25607", + "placeholder": "​", + "style": "IPY_MODEL_3e959bfd2db443348874d9980bf4b5ee", + "value": " 2.20k/2.20k [00:00<00:00, 90.5kB/s]" + } + }, + "3c64aa1f8c7b4e0c82f258024e7ded0e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3cee300aeb5f46ef92f985de4c4d8c5b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9df506d220bc4f088e695d6e6e8dd410", + "IPY_MODEL_38dd6765a7384afe883178c8c39238ad", + "IPY_MODEL_ecec4cbe3e4a4e9f93b6d715a4481fe4" + ], + "layout": "IPY_MODEL_ea60efd99eed4a1faeba698713c826a1" + } + }, + "3e959bfd2db443348874d9980bf4b5ee": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3f86d24626694134b1e3b6ecdca1873d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3fe5947afa004d4a8e6a8e2f225c8c77": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_75993e7de9b44c56a861a27c346bd083", + "max": 791656, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3c64aa1f8c7b4e0c82f258024e7ded0e", + "value": 791656 + } + }, + "4011ce0814034929a37f6753f704934a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "41bd4f85bc2e4cdb9aab5ed282e30dc3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "42ea16f73eba4daabd126e1b9b0aa8c6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "43c4fc857b844ac7bad7b17a98509545": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4472db5868914ecf9da013171c546a8a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "464170acb1744305a2b2eb8d7d2982a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4ade2de4dfaf4a9bae09483537a83436": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a50f1cdbd05842b1942ee3bb70ac69bf", + "IPY_MODEL_91dc1d6be97e4b2193512741eaf036c1", + "IPY_MODEL_e6cf99698661444e823619307d64a0eb" + ], + "layout": "IPY_MODEL_b6cc229b46ba424c9aef22a60488ffa2" + } + }, + "4f964217539546fd96c1d983d81199e3": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "52e5a877a6c840f0bd4c0f93cd32dd8b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6b7368ee67404dea83690bbc8b80f16e", + "IPY_MODEL_20d2c54b571f446990ea5a82b33fb860", + "IPY_MODEL_399ea14bf4504ff59f40d5b8fd2fe1b4" + ], + "layout": "IPY_MODEL_4011ce0814034929a37f6753f704934a" + } + }, + "63ceba7e338d4b91b8cbcca6a9d2e7a5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bfe6219541014a449261014e869b518c", + "placeholder": "​", + "style": "IPY_MODEL_c32322c84f7b4457b3a4d0e095bf80ac", + "value": " 1.40k/1.40k [00:00<00:00, 57.4kB/s]" + } + }, + "647ad74b3677442cac8cf44a430313ac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6621c21362c44179a4b56e6e1b7efc0f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6b7368ee67404dea83690bbc8b80f16e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fcaa9423baed457caec557bc4ea0b0ee", + "placeholder": "​", + "style": "IPY_MODEL_41bd4f85bc2e4cdb9aab5ed282e30dc3", + "value": "special_tokens_map.json: 100%" + } + }, + "74e2eef5069b45ff9363085b69996940": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bf147af5cd6b4b7fbe5b2fbdf05a69ce", + "placeholder": "​", + "style": "IPY_MODEL_4472db5868914ecf9da013171c546a8a", + "value": " 792k/792k [00:00<00:00, 3.69MB/s]" + } + }, + "75993e7de9b44c56a861a27c346bd083": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7ea52b50a86743f5aa38c3542fb3bcaf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "81a8920242114b888e420ab323c9046c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_be1fe34ce8fc4500a85a8dd787db92db", + "IPY_MODEL_062085846b394124b9f0d51a9a8b0ddc", + "IPY_MODEL_9b899b572d9c4cfd85577d367ba69361" + ], + "layout": "IPY_MODEL_6621c21362c44179a4b56e6e1b7efc0f" + } + }, + "91dc1d6be97e4b2193512741eaf036c1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_06c26a4680d54622a7f3fd152bb1bf55", + "max": 2537, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_96d408142b2047c8aab24d98dac054e0", + "value": 2537 + } + }, + "9635a709448841baa1215db963e22451": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, 
+ "description_width": "" + } + }, + "96d408142b2047c8aab24d98dac054e0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9b899b572d9c4cfd85577d367ba69361": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_23e43e84ea504257a941eb53104ceeb4", + "placeholder": "​", + "style": "IPY_MODEL_16575639f71240c0999926a05131bab0", + "value": " 990M/990M [00:13<00:00, 66.7MB/s]" + } + }, + "9df506d220bc4f088e695d6e6e8dd410": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee3265cc51e04c92bfb6c610f6aabacf", + "placeholder": "​", + "style": "IPY_MODEL_7ea52b50a86743f5aa38c3542fb3bcaf", + "value": "tokenizer.json: 100%" + } + }, + "a50f1cdbd05842b1942ee3bb70ac69bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e47dc255d5de4a639e2ff45d0b143d4b", + "placeholder": "​", + "style": "IPY_MODEL_e39e2fc6b5454939bc2031768327531b", + "value": "tokenizer_config.json: 100%" + } + }, + "a86fa2b93bd149c586e0546160e48dd6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_647ad74b3677442cac8cf44a430313ac", + "max": 1404, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_42ea16f73eba4daabd126e1b9b0aa8c6", + "value": 1404 + } + }, + "b5339d1d95cd444ea969bae0b6ebe8fd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b6cc229b46ba424c9aef22a60488ffa2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "be1fe34ce8fc4500a85a8dd787db92db": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1757784b06544cb2bb2313b1db62e96a", + "placeholder": "​", + "style": "IPY_MODEL_ec57c208211448a0b8f51befc71e7b65", + "value": "model.safetensors: 100%" + } + }, + "bf147af5cd6b4b7fbe5b2fbdf05a69ce": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bfe6219541014a449261014e869b518c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": 
null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c08d526fd2ff41958f13a089e9e4738e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c32322c84f7b4457b3a4d0e095bf80ac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cd8ab4d0c2c548d6898fa1dca7007fdb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d4c8955f58574c4b80fe01ff6863fefd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d93013daaa964de68654736ae33e326d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + 
"visibility": null, + "width": null + } + }, + "df0f50d605be4f1a85452fbb835b6b05": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dfa1895ed1b245fd80b9a806c9e25607": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": 
null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e39e2fc6b5454939bc2031768327531b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e47dc255d5de4a639e2ff45d0b143d4b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, 
+ "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e5b2155532714fcaa3c6cb7634124bbb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e6cf99698661444e823619307d64a0eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_c08d526fd2ff41958f13a089e9e4738e", + "placeholder": "​", + "style": "IPY_MODEL_cd8ab4d0c2c548d6898fa1dca7007fdb", + "value": " 2.54k/2.54k [00:00<00:00, 33.5kB/s]" + } + }, + "e880389770c249a9b9646c6bc444c2a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4f964217539546fd96c1d983d81199e3", + "placeholder": "​", + "style": "IPY_MODEL_d4c8955f58574c4b80fe01ff6863fefd", + "value": "config.json: 100%" + } + }, + "ea60efd99eed4a1faeba698713c826a1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + 
"padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ec57c208211448a0b8f51befc71e7b65": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ecec4cbe3e4a4e9f93b6d715a4481fe4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d93013daaa964de68654736ae33e326d", + "placeholder": "​", + "style": "IPY_MODEL_3f86d24626694134b1e3b6ecdca1873d", + "value": " 2.42M/2.42M [00:00<00:00, 6.70MB/s]" + } + }, + "ed059ea77fd34669a03efbfa4b4f1ed4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee3265cc51e04c92bfb6c610f6aabacf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f7c588d3aff541c9b58840b4df25aec7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f8c5a31dabd74077a3b19550b4753f7d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fcaa9423baed457caec557bc4ea0b0ee": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Marian.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Marian.ipynb new file mode 100644 index 00000000000000..44fbdde06405b7 --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Marian.ipynb @@ -0,0 +1,617 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_Marian.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import ONNX Marian models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `MarianTransformer` is only available in `Spark NLP 5.2.0` and later. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import Marian models via `MarianMTModel`. These models are usually under `Text2Text Generation` category and have `marian` in their labels\n", + "- Reference: [MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)\n", + "- Some [example models](https://huggingface.co/models?other=marian)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model\n", + "\n", + "- Let's install `transformers` package with the `onnx` extension and its dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.35.2`. This doesn't mean it won't work with the future releases\n", + "- We will also need `sentencepiece` for tokenization."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m400.9/400.9 kB\u001b[0m \u001b[31m889.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m14.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m22.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m22.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m35.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m45.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m33.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.14.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.35.2 optimum sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [Helsinki-NLP/opus-mt-en-bg](https://huggingface.co/Helsinki-NLP/opus-mt-en-bg) model from HuggingFace as an example\n", + "- In addition to `MarianMTModel` we also need to save the tokenizer. This is the same for every model, these are assets needed for tokenization inside Spark NLP.\n", + "- If we want to optimize the model, a GPU will be needed. 
Make sure to select the correct runtime." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import transformers\n", + "# Model name, either HF (e.g. \"Helsinki-NLP/opus-mt-en-bg\") or a local path\n", + "MODEL_NAME = \"Helsinki-NLP/opus-mt-en-bg\"\n", + "\n", + "\n", + "# Path to store the exported models\n", + "EXPORT_PATH = \"onnx_models/mt_en_bg_onnx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-09 15:36:35.997046: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2023-12-09 15:36:35.997138: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2023-12-09 15:36:35.997190: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2023-12-09 15:36:39.395061: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "Framework not specified. Using pt to export to ONNX.\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/marian/tokenization_marian.py:197: UserWarning: Recommended: pip install sacremoses.\n", + " warnings.warn(\"Recommended: pip install sacremoses.\")\n", + "Using the export variant default. 
Available variants are:\n", + " - default: The default ONNX variant.\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/marian/tokenization_marian.py:197: UserWarning: Recommended: pip install sacremoses.\n", + " warnings.warn(\"Recommended: pip install sacremoses.\")\n", + "Using framework PyTorch: 2.1.0+cu118\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/marian/modeling_marian.py:213: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/marian/modeling_marian.py:220: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attention_mask.size() != (bsz, 1, tgt_len, src_len):\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/marian/modeling_marian.py:252: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
This means that the trace might not generalize to other inputs!\n", + " if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):\n", + "Using framework PyTorch: 2.1.0+cu118\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_attn_mask_utils.py:66: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if input_shape[-1] > 1 or self.sliding_window is not None:\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_attn_mask_utils.py:137: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if past_key_values_length > 0:\n", + "Using framework PyTorch: 2.1.0+cu118\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/marian/modeling_marian.py:175: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " and past_key_value[0].shape[2] == key_value_states.shape[1]\n", + "Post-processing the exported models...\n", + "Weight deduplication check in the ONNX export requires accelerate. Please install accelerate to run it.\n", + "The two models proto have different outputs (25 and 13 outputs). 
Constant outputs will be added to unify the two models outputs.\n", + "Adding a constant output for present.0.encoder.key of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.0.encoder.value of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.1.encoder.key of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.1.encoder.value of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.2.encoder.key of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.2.encoder.value of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.3.encoder.key of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.3.encoder.value of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.4.encoder.key of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.4.encoder.value of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.5.encoder.key of shape [0, 8, 1, 64] in model2.\n", + "Adding a constant output for present.5.encoder.value of shape [0, 8, 1, 64] in model2.\n", + "Validating ONNX model onnx_models/mt_en_bg_onnx/encoder_model.onnx...\n", + "\t-[✓] ONNX model output names match reference model (last_hidden_state)\n", + "\t- Validating ONNX Model output \"last_hidden_state\":\n", + "\t\t-[✓] (2, 16, 512) matches (2, 16, 512)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "Validating ONNX model onnx_models/mt_en_bg_onnx/decoder_model_merged.onnx...\n", + "2023-12-09 15:37:23.337914899 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_8_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.338106443 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_4_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.339141499 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_7_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.339176748 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_4_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.339264647 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_22_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.339280914 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_17_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.390090751 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_1_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.391676666 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_11_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.391777448 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.391852526 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_12_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.391877847 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_10_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:23.391932788 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_13_output_0'. It is not used by any node and should be removed from the model.\n", + "\t-[✓] ONNX model output names match reference model (present.0.decoder.value, present.0.decoder.key, present.1.encoder.value, present.4.encoder.key, present.1.encoder.key, present.0.encoder.key, present.4.decoder.key, logits, present.4.decoder.value, present.1.decoder.key, present.4.encoder.value, present.1.decoder.value, present.2.decoder.value, present.3.decoder.key, present.2.encoder.value, present.0.encoder.value, present.3.decoder.value, present.5.encoder.key, present.5.encoder.value, present.5.decoder.key, present.3.encoder.key, present.2.encoder.key, present.3.encoder.value, present.2.decoder.key, present.5.decoder.value)\n", + "\t- Validating ONNX Model output \"logits\":\n", + "\t\t-[✓] (2, 16, 61708) matches (2, 16, 61708)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.decoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.decoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.encoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.encoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 
1e-05)\n", + "\t- Validating ONNX Model output \"present.1.decoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.decoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.encoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.encoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.decoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.decoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.encoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.encoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.decoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.decoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.encoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.encoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + 
"\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.decoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.decoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.encoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.encoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.decoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.decoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.encoder.key\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.encoder.value\":\n", + "\t\t-[✓] (2, 8, 16, 64) matches (2, 8, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "Validating ONNX model onnx_models/mt_en_bg_onnx/decoder_model_merged.onnx...\n", + "2023-12-09 15:37:25.106554510 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_8_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.106739046 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_4_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.107498591 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_7_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.107528970 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Shape_4_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.107607752 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_22_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.107626402 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_17_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.138559519 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_1_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.139517684 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_11_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.139600846 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.139628016 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_12_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.139660269 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_10_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:37:25.139707770 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/model/decoder/Constant_13_output_0'. It is not used by any node and should be removed from the model.\n", + "\t-[✓] ONNX model output names match reference model (present.0.decoder.value, present.0.decoder.key, present.3.decoder.key, present.1.decoder.value, present.4.decoder.key, present.5.decoder.key, logits, present.3.decoder.value, present.2.decoder.value, present.2.decoder.key, present.4.decoder.value, present.1.decoder.key, present.5.decoder.value)\n", + "\t- Validating ONNX Model output \"logits\":\n", + "\t\t-[✓] (2, 1, 61708) matches (2, 1, 61708)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.decoder.key\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.decoder.value\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.decoder.key\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.decoder.value\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.decoder.key\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.decoder.value\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 
64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.decoder.key\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.decoder.value\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.decoder.key\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.decoder.value\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.decoder.key\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.decoder.value\":\n", + "\t\t-[✓] (2, 8, 17, 64) matches (2, 8, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "The ONNX export succeeded and the exported model was saved at: onnx_models/mt_en_bg_onnx\n" + ] + } + ], + "source": [ + "# Export the model to ONNX using optimum\n", + "\n", + "# Export with optimizations (uncomment next line)\n", + "# !optimum-cli export onnx --task text2text-generation-with-past --model {MODEL_NAME} --optimize O2 {EXPORT_PATH}\n", + "# IMPORTANT - there is a bug in onnxruntime which crashes it when trying to optimize a T5 small model (or any derivative of it)\n", + "# There are two ways to addess the problem:\n", + "# 1. Go to onnx_model_bert.py in the onnxruntime module (the full path depends on the module version),\n", + "# find the BertOnnxModel class and comment the following line in the constructor:\n", + "# assert (num_heads == 0 and hidden_size == 0) or (num_heads > 0 and hidden_size % num_heads == 0)\n", + "# 2. 
Disable optimization by removing '--optimize O2' (use line below).\n", + "\n", + "# Export without optimizations\n", + "!optimum-cli export onnx --task text2text-generation-with-past --model {MODEL_NAME} {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1231176\n", + "-rw-r--r-- 1 root root 1384 Dec 9 15:36 config.json\n", + "-rw-r--r-- 1 root root 355460094 Dec 9 15:37 decoder_model_merged.onnx\n", + "-rw-r--r-- 1 root root 355224613 Dec 9 15:37 decoder_model.onnx\n", + "-rw-r--r-- 1 root root 342573852 Dec 9 15:37 decoder_with_past_model.onnx\n", + "-rw-r--r-- 1 root root 203194157 Dec 9 15:36 encoder_model.onnx\n", + "-rw-r--r-- 1 root root 288 Dec 9 15:36 generation_config.json\n", + "-rw-r--r-- 1 root root 791438 Dec 9 15:36 source.spm\n", + "-rw-r--r-- 1 root root 74 Dec 9 15:36 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 999053 Dec 9 15:36 target.spm\n", + "-rw-r--r-- 1 root root 818 Dec 9 15:36 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2451253 Dec 9 15:36 vocab.json\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- As you can see, we need to move the sentence piece models `*.spm` from the tokenizer to assets folder which Spark NLP will look for\n", + "- We also need to process `vocab.json` for the tokenizer vocabulary. The Spark NLP Annotator expects a `vocab.txt` with one word per line." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! mkdir -p {EXPORT_PATH}/assets\n", + "! 
mv -t {EXPORT_PATH}/assets {EXPORT_PATH}/*.spm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "output_json = json.load(open(f\"{EXPORT_PATH}/vocab.json\"))\n", + "\n", + "with open(f\"{EXPORT_PATH}/assets/vocab.txt\", \"w\") as f:\n", + " for key in output_json.keys():\n", + " print(key, file=f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 2528\n", + "-rw-r--r-- 1 root root 791438 Dec 9 15:36 source.spm\n", + "-rw-r--r-- 1 root root 999053 Dec 9 15:36 target.spm\n", + "-rw-r--r-- 1 root root 792353 Dec 9 15:38 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save Marian in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.2.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.2.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m548.5/548.5 kB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m20.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` function in `MarianTransformer` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `MarianTransformer` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "marian = MarianTransformer.loadSavedModel(EXPORT_PATH, spark)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "marian.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX Marian model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 547408\n", + "-rw-r--r-- 1 root root 355514472 Dec 9 15:40 decoder.onxx\n", + "-rw-r--r-- 1 root root 203225300 Dec 9 15:39 encoder.onxx\n", + "-rw-r--r-- 1 root root 791438 Dec 9 15:40 marian_spp_src\n", + "-rw-r--r-- 1 root root 999053 Dec 9 15:40 marian_spp_trg\n", + "drwxr-xr-x 2 root root 4096 Dec 9 15:39 metadata\n" + ] + } + ], + "source": [ + "! 
ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny Marian model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|result |\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[В началото на 2004 г., в началото на 2007 г., в началото на 2007 г., се провеждаше и конгрес на тема „Статии за развитие на земеделието“ и „]|\n", + "+----------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "test_data = spark.createDataFrame([\n", + " (1, \"Rome (Italian and Latin: Roma [ˈroːma] ⓘ) is the capital city of Italy. It is also the capital of the Lazio region, the centre of the Metropolitan City of Rome, and a special comune named Comune di Roma Capitale.s listed by UNESCO as a World Heritage Site.[17] The host city for the 1960 Summer Olympics, Rome is also the seat of several specialised agencies of the United Nations, such as the Food and Agriculture Organization (FAO), the World Food Programme (WFP) and the International Fund for Agricultural Development (IFAD). 
The city also hosts the Secretariat of the Parliamentary Assembly of the Union for the Mediterranean[18] (UfM) as well as the headquarters of many international businesses, such as Eni, Enel, TIM, Leonardo, and banks such as BNL. Numerous companies are based within Rome's EUR business district, such as the luxury fashion house Fendi located in the Palazzo della Civiltà Italiana. The presence of renowned international brands in the city has made Rome an important centre of fashion and design, and the Cinecittà Studios have been the set of many Academy Award–winning movies.\"),\n", + "]).toDF(\"id\", \"text\")\n", + "\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "marian = MarianTransformer.load(f\"{MODEL_NAME}_spark_nlp\") \\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"translation\")\\\n", + " .setMaxInputLength(512)\n", + "\n", + "pipeline = Pipeline().setStages([document_assembler, marian])\n", + "\n", + "result = pipeline.fit(test_data).transform(test_data)\n", + "result.select(\"translation.result\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! 
You can now go wild and use hundreds of Marian models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb new file mode 100644 index 00000000000000..8a7518f56631df --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb @@ -0,0 +1,902 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_T5.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import ONNX T5 models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- ONNX support for the `T5Transformer` is only available in `Spark NLP 5.2.0` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import T5 models via `T5Model`. These models are usually under `Text2Text Generation` category and have `T5` in their labels\n", + "- This is a very computationally expensive module especially on larger sequences. 
The use of an accelerator such as GPU is recommended.\n", + "- Reference: [T5Model](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Model)\n", + "- Some [example models](https://huggingface.co/models?other=T5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model\n", + "\n", + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.35.2`. This doesn't mean it won't work with the future releases\n", + "- We will also need `sentencepiece` for tokenization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m400.9/400.9 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 
kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m27.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m52.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.14.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.35.2 optimum sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [google/flan-t5-base](https://huggingface.co/google/flan-t5-base) model from HuggingFace as an example\n", + "- In addition to `T5Model` we also need to save the tokenizer. This is the same for every model, these are assets needed for tokenization inside Spark NLP.\n", + "- If we want to optimize the model, a GPU will be needed. Make sure to select the correct runtime.\n", + "0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import transformers\n", + "# Model name, either HF (e.g. 
\"google/flan-t5-base\") or a local path\n", + "MODEL_NAME = \"google/flan-t5-base\"\n", + "\n", + "\n", + "# Path to store the exported models\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-09 15:50:28.712604: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2023-12-09 15:50:28.712694: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2023-12-09 15:50:28.712744: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2023-12-09 15:50:31.175343: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "Framework not specified. Using pt to export to ONNX.\n", + "config.json: 100% 1.40k/1.40k [00:00<00:00, 5.73MB/s]\n", + "model.safetensors: 100% 990M/990M [00:12<00:00, 82.4MB/s]\n", + "generation_config.json: 100% 147/147 [00:00<00:00, 555kB/s]\n", + "tokenizer_config.json: 100% 2.54k/2.54k [00:00<00:00, 8.77MB/s]\n", + "spiece.model: 100% 792k/792k [00:00<00:00, 138MB/s]\n", + "tokenizer.json: 100% 2.42M/2.42M [00:00<00:00, 15.2MB/s]\n", + "special_tokens_map.json: 100% 2.20k/2.20k [00:00<00:00, 9.60MB/s]\n", + "Using the export variant default. 
Available variants are:\n", + " - default: The default ONNX variant.\n", + "Using framework PyTorch: 2.1.0+cu118\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n", + "Using framework PyTorch: 2.1.0+cu118\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:873: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if causal_mask.shape[1] < attention_mask.shape[1]:\n", + "Using framework PyTorch: 2.1.0+cu118\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n", + "/usr/local/lib/python3.10/dist-packages/transformers/models/t5/modeling_t5.py:508: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " elif past_key_value.shape[2] != key_value_states.shape[1]:\n", + "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", + "In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode\n", + "Post-processing the exported models...\n", + "Weight deduplication check in the ONNX export requires accelerate. Please install accelerate to run it.\n", + "The two models proto have different outputs (49 and 25 outputs). 
Constant outputs will be added to unify the two models outputs.\n", + "Adding a constant output for present.0.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.0.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.1.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.1.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.2.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.2.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.3.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.3.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.4.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.4.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.5.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.5.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.6.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.6.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.7.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.7.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.8.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.8.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.9.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.9.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for 
present.10.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.10.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.11.encoder.key of shape [0, 12, 1, 64] in model2.\n", + "Adding a constant output for present.11.encoder.value of shape [0, 12, 1, 64] in model2.\n", + "Validating ONNX model onnx_models/google/flan-t5-base/encoder_model.onnx...\n", + "\t-[✓] ONNX model output names match reference model (last_hidden_state)\n", + "\t- Validating ONNX Model output \"last_hidden_state\":\n", + "\t\t-[✓] (2, 16, 768) matches (2, 16, 768)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "Validating ONNX model onnx_models/google/flan-t5-base/decoder_model_merged.onnx...\n", + "2023-12-09 15:51:48.208017351 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208083363 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208097809 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208108009 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208123335 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.2/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208132392 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208142039 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208152218 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208160006 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208180803 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208190583 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208198618 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208206693 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.2/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208214643 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208238111 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208248040 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208267097 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208274621 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208280709 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208287100 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208295365 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.1/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208305471 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208317459 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208324941 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208336444 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208347444 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208357823 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208366278 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208374195 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.1/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208399398 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208409515 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208421604 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208440000 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208450547 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208457592 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208477670 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.208494899 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/final_layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209574821 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209602649 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209614041 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209628454 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209654487 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209673964 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209681506 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209690866 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.2/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209699687 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209715509 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209722772 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209736676 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209751723 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209759121 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209769884 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209783392 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.2/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209811926 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209822486 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209829954 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209845010 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209857264 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209863876 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209871035 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209886358 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.0/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209895598 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209904553 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209912621 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209922981 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209939977 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209947278 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209966706 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209977370 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.1/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.209986458 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.210008378 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.210016191 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.210024666 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.210033910 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/final_layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.296212541 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_370'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.296270003 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_368'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.296279095 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Constant_8_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.296296726 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Shape_3_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.297386573 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_314'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.297433644 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Constant_7_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.297470013 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_316'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.297530880 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Shape_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.338965280 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_343'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.339025410 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.340208075 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_295'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.340237966 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.427374890 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/ConstantOfShape_1_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:48.428639761 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/ConstantOfShape_1_output_0'. It is not used by any node and should be removed from the model.\n", + "\t-[✓] ONNX model output names match reference model (present.2.encoder.key, present.0.decoder.key, present.8.decoder.value, present.11.encoder.key, logits, present.7.encoder.value, present.11.decoder.key, present.9.decoder.value, present.1.decoder.value, present.4.encoder.value, present.4.decoder.value, present.6.encoder.value, present.2.encoder.value, present.6.encoder.key, present.10.encoder.key, present.11.encoder.value, present.9.decoder.key, present.7.decoder.value, present.4.encoder.key, present.9.encoder.key, present.10.decoder.key, present.3.decoder.key, present.8.encoder.value, present.1.encoder.key, present.2.decoder.key, present.1.encoder.value, present.6.decoder.key, present.11.decoder.value, present.5.decoder.key, present.7.decoder.key, present.7.encoder.key, present.8.decoder.key, present.0.encoder.key, present.10.decoder.value, present.5.decoder.value, present.5.encoder.key, present.1.decoder.key, present.3.encoder.value, present.0.decoder.value, present.2.decoder.value, present.3.encoder.key, present.9.encoder.value, present.10.encoder.value, present.0.encoder.value, present.4.decoder.key, present.8.encoder.key, present.5.encoder.value, present.6.decoder.value, present.3.decoder.value)\n", + "\t- Validating ONNX Model output \"logits\":\n", + "\t\t-[✓] (2, 16, 32128) matches (2, 16, 32128)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 
64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output 
\"present.3.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 1.9073486328125e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 
64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.6.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.6.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.3603439331054688e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.6.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.6.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.7.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.7.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 1.811981201171875e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.7.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.7.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.8.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.8.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.193450927734375e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.8.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 
16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.8.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.9.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.9.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.9087066650390625e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.9.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.9.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.10.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.574920654296875e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.10.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 5.14984130859375e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.10.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.10.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.09808349609375e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.11.decoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.384185791015625e-05 (atol: 1e-05)\n", + "\t- Validating ONNX 
Model output \"present.11.decoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[x] values not close enough, max diff: 4.1484832763671875e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.11.encoder.key\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.11.encoder.value\":\n", + "\t\t-[✓] (2, 12, 16, 64) matches (2, 12, 16, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "Validating ONNX model onnx_models/google/flan-t5-base/decoder_model_merged.onnx...\n", + "2023-12-09 15:51:51.066662621 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066740990 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066755838 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066766224 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066783678 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.2/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066792789 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066807718 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066828873 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066844908 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066873884 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066893121 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066902640 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066915783 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.2/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066928136 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066970164 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.066983878 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067010352 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067021723 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067031786 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067041416 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067053275 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.1/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067067646 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067084830 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067096963 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067114183 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067130196 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067146084 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067158086 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067170672 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.1/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067205436 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067221892 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067238753 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067264981 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067279333 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067290760 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067318324 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.067342512 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/final_layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068437655 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068472192 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068485686 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.11/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068513648 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068553098 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068581661 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068594421 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068608852 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.2/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068622518 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068646811 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068659238 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068676697 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068696910 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068709118 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068724187 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.4/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068744855 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.2/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068769920 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068783680 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068795337 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.2/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068814117 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068838786 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.1/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068849310 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.0/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068860510 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068882381 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.0/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068896055 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068910675 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.10/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068923158 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068946958 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.5/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068958596 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.3/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068968973 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.068995012 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.069013807 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.7/layer.1/layer_norm/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.069027040 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.1/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.069057772 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.8/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.069069730 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.9/layer.0/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.069082472 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/block.6/layer.2/layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.069096216 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/final_layer_norm/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.161041863 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_370'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.161099643 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_368'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.161109168 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Constant_8_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.161131091 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Shape_3_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.162257670 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_314'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.162316903 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Constant_7_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.162365825 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_316'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.162445897 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Shape_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.203729989 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_343'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.203793895 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Constant_2_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.205036049 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer 'onnx::Unsqueeze_295'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.205074263 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/Constant_2_output_0'. 
It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.288856449 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/ConstantOfShape_1_output_0'. It is not used by any node and should be removed from the model.\n", + "2023-12-09 15:51:51.290187950 [W:onnxruntime:, graph.cc:3553 CleanUnusedInitializersAndNodeArgs] Removing initializer '/decoder/ConstantOfShape_1_output_0'. It is not used by any node and should be removed from the model.\n", + "\t-[✓] ONNX model output names match reference model (present.0.decoder.key, present.8.decoder.value, logits, present.11.decoder.key, present.9.decoder.value, present.1.decoder.value, present.4.decoder.value, present.9.decoder.key, present.7.decoder.value, present.10.decoder.key, present.3.decoder.key, present.2.decoder.key, present.6.decoder.key, present.11.decoder.value, present.5.decoder.key, present.7.decoder.key, present.8.decoder.key, present.10.decoder.value, present.5.decoder.value, present.1.decoder.key, present.0.decoder.value, present.2.decoder.value, present.4.decoder.key, present.6.decoder.value, present.3.decoder.value)\n", + "\t- Validating ONNX Model output \"logits\":\n", + "\t\t-[✓] (2, 1, 32128) matches (2, 1, 32128)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.0.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.1.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 
1e-05)\n", + "\t- Validating ONNX Model output \"present.2.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.2.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.3.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.4.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 1.6510486602783203e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.5.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.0802021026611328e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.6.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.6.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 1.9609928131103516e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.7.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close 
(atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.7.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.6941299438476562e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.8.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[✓] all values close (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.8.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 3.266334533691406e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.9.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 1.811981201171875e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.9.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 5.53131103515625e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.10.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.086162567138672e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.10.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 6.723403930664062e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.11.decoder.key\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 2.8252601623535156e-05 (atol: 1e-05)\n", + "\t- Validating ONNX Model output \"present.11.decoder.value\":\n", + "\t\t-[✓] (2, 12, 17, 64) matches (2, 12, 17, 64)\n", + "\t\t-[x] values not close enough, max diff: 6.0439109802246094e-05 (atol: 1e-05)\n", + "Validation for the model onnx_models/google/flan-t5-base/decoder_model_merged.onnx raised: The maximum absolute 
difference between the output of the reference model and the ONNX exported model is not within the set tolerance 1e-05:\n", + "- present.4.decoder.value: max diff = 1.9073486328125e-05\n", + "- present.6.decoder.value: max diff = 2.3603439331054688e-05\n", + "- present.7.decoder.value: max diff = 1.811981201171875e-05\n", + "- present.8.decoder.value: max diff = 2.193450927734375e-05\n", + "- present.9.decoder.value: max diff = 2.9087066650390625e-05\n", + "- present.10.decoder.key: max diff = 2.574920654296875e-05\n", + "- present.10.decoder.value: max diff = 5.14984130859375e-05\n", + "- present.10.encoder.value: max diff = 2.09808349609375e-05\n", + "- present.11.decoder.key: max diff = 2.384185791015625e-05\n", + "- present.11.decoder.value: max diff = 4.1484832763671875e-05\n", + "The ONNX export succeeded with the warning: The maximum absolute difference between the output of the reference model and the ONNX exported model is not within the set tolerance 1e-05:\n", + "- present.4.decoder.value: max diff = 1.6510486602783203e-05\n", + "- present.5.decoder.value: max diff = 2.0802021026611328e-05\n", + "- present.6.decoder.value: max diff = 1.9609928131103516e-05\n", + "- present.7.decoder.value: max diff = 2.6941299438476562e-05\n", + "- present.8.decoder.value: max diff = 3.266334533691406e-05\n", + "- present.9.decoder.key: max diff = 1.811981201171875e-05\n", + "- present.9.decoder.value: max diff = 5.53131103515625e-05\n", + "- present.10.decoder.key: max diff = 2.086162567138672e-05\n", + "- present.10.decoder.value: max diff = 6.723403930664062e-05\n", + "- present.11.decoder.key: max diff = 2.8252601623535156e-05\n", + "- present.11.decoder.value: max diff = 6.0439109802246094e-05.\n", + " The exported model was saved at: onnx_models/google/flan-t5-base\n" + ] + } + ], + "source": [ + "# Export the model to ONNX using optimum\n", + "\n", + "# Export with optimizations (uncomment next line)\n", + "# !optimum-cli export onnx --task 
text2text-generation-with-past --model {MODEL_NAME} --optimize O2 {EXPORT_PATH}\n", + "# IMPORTANT - there is a bug in onnxruntime which crashes it when trying to optimize a T5 small model (or any derivative of it)\n", + "# There are two ways to addess the problem:\n", + "# 1. Go to onnx_model_bert.py in the onnxruntime module (the full path depends on the module version),\n", + "# find the BertOnnxModel class and comment the following line in the constructor:\n", + "# assert (num_heads == 0 and hidden_size == 0) or (num_heads > 0 and hidden_size % num_heads == 0)\n", + "# 2. Disable optimization by removing '--optimize O2' (use line below).\n", + "\n", + "# Export without optimizations\n", + "!optimum-cli export onnx --task text2text-generation-with-past --model {MODEL_NAME} {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 2283400\n", + "-rw-r--r-- 1 root root 1529 Dec 9 15:50 config.json\n", + "-rw-r--r-- 1 root root 651182887 Dec 9 15:51 decoder_model_merged.onnx\n", + "-rw-r--r-- 1 root root 650848962 Dec 9 15:51 decoder_model.onnx\n", + "-rw-r--r-- 1 root root 594197310 Dec 9 15:51 decoder_with_past_model.onnx\n", + "-rw-r--r-- 1 root root 438697389 Dec 9 15:50 encoder_model.onnx\n", + "-rw-r--r-- 1 root root 142 Dec 9 15:50 generation_config.json\n", + "-rw-r--r-- 1 root root 2201 Dec 9 15:50 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 791656 Dec 9 15:50 spiece.model\n", + "-rw-r--r-- 1 root root 20771 Dec 9 15:50 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2422256 Dec 9 15:50 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- As you can see, we need to move the 
sentence piece models `spiece.model` from the tokenizer to assets folder which Spark NLP will look for" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! mkdir -p {EXPORT_PATH}/assets\n", + "! mv -t {EXPORT_PATH}/assets {EXPORT_PATH}/spiece.model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 776\n", + "-rw-r--r-- 1 root root 791656 Dec 9 15:50 spiece.model\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save T5 in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.2.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.2.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m548.5/548.5 kB\u001b[0m \u001b[31m41.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! 
wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `T5Transformer` which allows us to load the ONNX model\n", + "- Most params will be set automatically. They can also be set later after loading the model in `T5Transformer` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "T5 = T5Transformer.loadSavedModel(EXPORT_PATH, spark)\\\n", + " .setUseCache(True) \\\n", + " .setTask(\"summarize:\") \\\n", + " .setMaxOutputLength(200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "T5.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX T5 model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1065292\n", + "-rw-r--r-- 1 root root 651282390 Dec 9 16:07 decoder.onxx\n", + "-rw-r--r-- 1 root root 438764467 Dec 9 16:07 encoder.onxx\n", + "drwxr-xr-x 2 root root 4096 Dec 9 16:07 metadata\n", + "-rw-r--r-- 1 root root 791656 Dec 9 16:07 t5_spp\n" + ] + } + ], + "source": [ + "! 
ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny T5 model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------------------------------------------------------------------------------------------------+\n", + "|result |\n", + "+-----------------------------------------------------------------------------------------------------------+\n", + "|[We introduce a unified framework that converts text-to-text language problems into a text-to-text format.]|\n", + "+-----------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "test_data = spark.createDataFrame([\n", + " [\"Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a \" +\n", + " \"downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness\" +\n", + " \" of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this \" +\n", + " \"paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework \" +\n", + " \"that converts all text-based language problems into a text-to-text format. Our systematic study compares \" +\n", + " \"pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens \" +\n", + " \"of language understanding tasks. 
By combining the insights from our exploration with scale and our new \" +\n", + " \"Colossal Clean Crawled Corpus, we achieve state-of-the-art results on many benchmarks covering \" +\n", + " \"summarization, question answering, text classification, and more. To facilitate future work on transfer \" +\n", + " \"learning for NLP, we release our data set, pre-trained models, and code.\"]\n", + "]).toDF(\"text\")\n", + "\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "T5 = T5Transformer.load(f\"{MODEL_NAME}_spark_nlp\") \\\n", + " .setInputCols([\"document\"]) \\\n", + " .setOutputCol(\"summary\")\n", + "\n", + "pipeline = Pipeline().setStages([document_assembler, T5])\n", + "\n", + "result = pipeline.fit(test_data).transform(test_data)\n", + "result.select(\"summary.result\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of T5 models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 05dda0769456186a6c1dc4ada08b917e012ecd23 Mon Sep 17 00:00:00 2001 From: David Cecchini Date: Wed, 27 Dec 2023 12:42:25 -0300 Subject: [PATCH 04/14] Added BGE Embeddings (#14090) * Added BGE Embeddings * Fixed class names --- .../annotator/embeddings/bge_embeddings.py | 192 +++++++ python/sparknlp/internal/__init__.py | 3 + .../scala/com/johnsnowlabs/ml/ai/BGE.scala | 247 +++++++++ .../com/johnsnowlabs/ml/util/LinAlg.scala | 8 +- .../nlp/embeddings/BGEEmbeddings.scala | 482 ++++++++++++++++++ .../embeddings/BGEEmbeddingsTestSpec.scala | 116 +++++ 6 files changed, 1047 insertions(+), 1 deletion(-) create mode 100644 python/sparknlp/annotator/embeddings/bge_embeddings.py create mode 
100644 src/main/scala/com/johnsnowlabs/ml/ai/BGE.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddingsTestSpec.scala diff --git a/python/sparknlp/annotator/embeddings/bge_embeddings.py b/python/sparknlp/annotator/embeddings/bge_embeddings.py new file mode 100644 index 00000000000000..0c0428141a3ec7 --- /dev/null +++ b/python/sparknlp/annotator/embeddings/bge_embeddings.py @@ -0,0 +1,192 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for BGEEmbeddings.""" + +from sparknlp.common import * + + +class BGEEmbeddings(AnnotatorModel, + HasEmbeddingsProperties, + HasCaseSensitiveProperties, + HasStorageRef, + HasBatchedAnnotate, + HasMaxSentenceLengthLimit): + """Sentence embeddings using BGE. + + BGE, or BAAI General Embeddings, a model that can map any text to a low-dimensional dense + vector which can be used for tasks like retrieval, classification, clustering, or semantic search. + + Pretrained models can be loaded with `pretrained` of the companion object: + + >>> embeddings = BGEEmbeddings.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("bge_embeddings") + + + The default model is ``"bge_base"``, if no name is provided. + + For available pretrained models please see the + `Models Hub `__. 
+ + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``SENTENCE_EMBEDDINGS`` + ====================== ====================== + + Parameters + ---------- + batchSize + Size of every batch , by default 8 + dimension + Number of embedding dimensions, by default 768 + caseSensitive + Whether to ignore case in tokens for embeddings matching, by default False + maxSentenceLength + Max sentence length to process, by default 512 + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. + + References + ---------- + `C-Pack: Packaged Resources To Advance General Chinese Embedding `__ + `BGE Github Repository `__ + + **Paper abstract** + + *We introduce C-Pack, a package of resources that significantly advance the field of general + Chinese embeddings. C-Pack includes three critical resources. + 1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets. + 2) C-MTP is a massive text embedding dataset curated from labeled and unlabeled Chinese corpora + for training embedding models. + 3) C-TEM is a family of embedding models covering multiple sizes. + Our models outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the + time of the release. We also integrate and optimize the entire suite of training methods for + C-TEM. Along with our resources on general Chinese embedding, we release our data and models for + English text embeddings. The English models achieve stateof-the-art performance on the MTEB + benchmark; meanwhile, our released English data is 2 times larger than the Chinese data. 
All + these resources are made publicly available at https://github.com/FlagOpen/FlagEmbedding.* + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> documentAssembler = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> embeddings = BGEEmbeddings.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("bge_embeddings") + >>> embeddingsFinisher = EmbeddingsFinisher() \\ + ... .setInputCols(["bge_embeddings"]) \\ + ... .setOutputCols("finished_embeddings") \\ + ... .setOutputAsVector(True) + >>> pipeline = Pipeline().setStages([ + ... documentAssembler, + ... embeddings, + ... embeddingsFinisher + ... ]) + >>> data = spark.createDataFrame([["query: how much protein should a female eat", + ... "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + \ + ... "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + \ + ... "marathon. Check out the chart below to see how much protein you should be eating each day.", + ... 
]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80) + +--------------------------------------------------------------------------------+ + | result| + +--------------------------------------------------------------------------------+ + |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...| + |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...| + +--------------------------------------------------------------------------------+ + """ + + name = "BGEEmbeddings" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + + outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS + configProtoBytes = Param(Params._dummy(), + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()", + TypeConverters.toListInt) + + + def setConfigProtoBytes(self, b): + """Sets configProto from tensorflow, serialized into byte array. + + Parameters + ---------- + b : List[int] + ConfigProto from tensorflow, serialized into byte array + """ + return self._set(configProtoBytes=b) + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.BGEEmbeddings", java_model=None): + super(BGEEmbeddings, self).__init__( + classname=classname, + java_model=java_model + ) + self._setDefault( + dimension=768, + batchSize=8, + maxSentenceLength=512, + caseSensitive=False, + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. 
+ + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + BGEEmbeddings + The restored model + """ + from sparknlp.internal import _BGELoader + jModel = _BGELoader(folder, spark_session._jsparkSession)._java_obj + return BGEEmbeddings(java_model=jModel) + + @staticmethod + def pretrained(name="bge_base", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default "bge_base" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. + + Returns + ------- + BGEEmbeddings + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(BGEEmbeddings, name, lang, remote_loc) diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index 80e3749e323875..f49a5e4768deab 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -147,6 +147,9 @@ class _E5Loader(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_E5Loader, self).__init__("com.johnsnowlabs.nlp.embeddings.E5Embeddings.loadSavedModel", path, jspark) +class _BGELoader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_BGELoader, self).__init__("com.johnsnowlabs.nlp.embeddings.BGEEmbeddings.loadSavedModel", path, jspark) class _GPT2Loader(ExtendedJavaWrapper): def __init__(self, path, jspark): diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/BGE.scala b/src/main/scala/com/johnsnowlabs/ml/ai/BGE.scala new file mode 100644 index 00000000000000..fb421b1fd58ebf --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/ai/BGE.scala @@ -0,0 +1,247 @@ +/* + * Copyright 2017 - 2023 John Snow Labs + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.ml.ai + +import ai.onnxruntime.{OnnxTensor, TensorInfo} +import com.johnsnowlabs.ml.onnx.{OnnxSession, OnnxWrapper} +import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} +import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.ml.util.{LinAlg, ONNX, TensorFlow} +import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} + +import scala.collection.JavaConverters._ + +/** BGE Sentence embeddings model + * @param tensorflowWrapper + * tensorflow wrapper + * @param configProtoBytes + * config proto bytes + * @param sentenceStartTokenId + * sentence start token id + * @param sentenceEndTokenId + * sentence end token id + * @param signatures + * signatures + */ +private[johnsnowlabs] class BGE( + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], + configProtoBytes: Option[Array[Byte]] = None, + sentenceStartTokenId: Int, + sentenceEndTokenId: Int, + signatures: Option[Map[String, String]] = None) + extends Serializable { + + private val _tfInstructorSignatures: Map[String, String] = + signatures.getOrElse(ModelSignatureManager.apply()) + private val paddingTokenId = 0 + + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name + private 
val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions + + /** Get sentence embeddings for a batch of sentences + * @param batch + * batch of sentences + * @return + * sentence embeddings + */ + private def getSentenceEmbedding(batch: Seq[Array[Int]]): Array[Array[Float]] = { + val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max + val paddedBatch = batch.map(arr => padArrayWithZeros(arr, maxSentenceLength)) + val embeddings = detectedEngine match { + case ONNX.name => + getSentenceEmbeddingFromOnnx(paddedBatch, maxSentenceLength) + case _ => + getSentenceEmbeddingFromTF(paddedBatch, maxSentenceLength) + } + embeddings + } + + private def padArrayWithZeros(arr: Array[Int], maxLength: Int): Array[Int] = { + if (arr.length >= maxLength) { + arr + } else { + arr ++ Array.fill(maxLength - arr.length)(0) + } + } + + private def getSentenceEmbeddingFromTF( + batch: Seq[Array[Int]], + maxSentenceLength: Int): Array[Array[Float]] = { + val batchLength = batch.length + + // encode batch + val tensorEncoder = new TensorResources() + val inputDim = batch.length * maxSentenceLength + + // create buffers + val encoderInputBuffers = tensorEncoder.createIntBuffer(inputDim) + val encoderAttentionMaskBuffers = tensorEncoder.createIntBuffer(inputDim) + + val shape = Array(batch.length.toLong, maxSentenceLength) + + batch.zipWithIndex.foreach { case (tokenIds, idx) => + val offset = idx * maxSentenceLength + val diff = maxSentenceLength - tokenIds.length + + // pad with 0 + val s = tokenIds.take(maxSentenceLength) ++ Array.fill[Int](diff)(this.paddingTokenId) + encoderInputBuffers.offset(offset).write(s) + + // create attention mask + val mask = s.map(x => if (x != this.paddingTokenId) 1 else 0) + encoderAttentionMaskBuffers.offset(offset).write(mask) + + } + + // create tensors + val encoderInputTensors = tensorEncoder.createIntBufferTensor(shape, encoderInputBuffers) + val encoderAttentionMaskTensors = + 
tensorEncoder.createIntBufferTensor(shape, encoderAttentionMaskBuffers) + + // run model + val runner = tensorflowWrapper.get + .getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + initAllTables = false, + savedSignatures = signatures) + .runner + + runner + .feed( + _tfInstructorSignatures.getOrElse( + ModelSignatureConstants.EncoderInputIds.key, + "missing_encoder_input_ids"), + encoderInputTensors) + .feed( + _tfInstructorSignatures.getOrElse( + ModelSignatureConstants.EncoderAttentionMask.key, + "missing_encoder_attention_mask"), + encoderAttentionMaskTensors) + .fetch(_tfInstructorSignatures + .getOrElse(ModelSignatureConstants.LastHiddenState.key, "missing_last_hidden_state")) + + // get embeddings + val sentenceEmbeddings = runner.run().asScala + val sentenceEmbeddingsFloats = TensorResources.extractFloats(sentenceEmbeddings.head) + val dim = sentenceEmbeddingsFloats.length / batchLength + + // group embeddings + val sentenceEmbeddingsFloatsArray = sentenceEmbeddingsFloats.grouped(dim).toArray + + // close buffers + sentenceEmbeddings.foreach(_.close()) + encoderInputTensors.close() + encoderAttentionMaskTensors.close() + tensorEncoder.clearTensors() + tensorEncoder.clearSession(sentenceEmbeddings) + + sentenceEmbeddingsFloatsArray + } + + private def getSentenceEmbeddingFromOnnx( + batch: Seq[Array[Int]], + maxSentenceLength: Int): Array[Array[Float]] = { + + val inputIds = batch.map(x => x.map(x => x.toLong)).toArray + val attentionMask = batch.map(sentence => sentence.map(x => if (x < 0L) 0L else 1L)).toArray + + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + + val tokenTensors = OnnxTensor.createTensor(env, inputIds) + val maskTensors = OnnxTensor.createTensor(env, attentionMask) + val segmentTensors = + OnnxTensor.createTensor(env, batch.map(x => Array.fill(maxSentenceLength)(0L)).toArray) + val inputs = + Map( + "input_ids" -> tokenTensors, + "attention_mask" -> maskTensors, + "token_type_ids" -> 
segmentTensors).asJava + + // TODO: A try without a catch or finally is equivalent to putting its body in a block; no exceptions are handled. + try { + val results = runner.run(inputs) + val lastHiddenState = results.get("last_hidden_state").get() + val info = lastHiddenState.getInfo.asInstanceOf[TensorInfo] + val shape = info.getShape + try { + val embeddings = lastHiddenState + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + + val dim = shape.last.toInt + // Perform CLS pooling (the first element of each sequence) + val clsPooling = embeddings.grouped(dim).map(_.head).toArray + val normalizedSentenceEmbeddings = LinAlg.lpNormalizeArray(clsPooling, 2) + + Array(normalizedSentenceEmbeddings) + } finally if (results != null) results.close() + } + } + + /** Predict sentence embeddings for a batch of sentences + * @param sentences + * sentences + * @param tokenizedSentences + * tokenized sentences + * @param batchSize + * batch size + * @param maxSentenceLength + * max sentence length + * @return + */ + def predict( + sentences: Seq[Annotation], + tokenizedSentences: Seq[WordpieceTokenizedSentence], + batchSize: Int, + maxSentenceLength: Int): Seq[Annotation] = { + + tokenizedSentences + .zip(sentences) + .zipWithIndex + .grouped(batchSize) + .toArray + .flatMap { batch => + val tokensBatch = batch.map(x => x._1._1.tokens) + val tokens = tokensBatch.map(x => + Array(sentenceStartTokenId) ++ x + .map(y => y.pieceId) + .take(maxSentenceLength - 2) ++ Array(sentenceEndTokenId)) + + val sentenceEmbeddings = getSentenceEmbedding(tokens) + + batch.zip(sentenceEmbeddings).map { case (sentence, vectors) => + Annotation( + annotatorType = AnnotatorType.SENTENCE_EMBEDDINGS, + begin = sentence._1._2.begin, + end = sentence._1._2.end, + result = sentence._1._2.result, + metadata = sentence._1._2.metadata, + embeddings = vectors) + } + } + } + +} diff --git 
a/src/main/scala/com/johnsnowlabs/ml/util/LinAlg.scala b/src/main/scala/com/johnsnowlabs/ml/util/LinAlg.scala index 266bc6a69a46aa..cf23c78a83427a 100644 --- a/src/main/scala/com/johnsnowlabs/ml/util/LinAlg.scala +++ b/src/main/scala/com/johnsnowlabs/ml/util/LinAlg.scala @@ -1,7 +1,7 @@ package com.johnsnowlabs.ml.util import breeze.linalg.{DenseMatrix, tile} -import scala.math.sqrt +import scala.math.{sqrt, pow} object LinAlg { @@ -130,4 +130,10 @@ object LinAlg { array.map(value => if (l2Norm != 0.0f) value / l2Norm else 0.0f) } + def lpNormalizeArray(array: Array[Float], p: Int = 2): Array[Float] = { + val lpNorm: Float = pow(array.map(x => pow(x, p)).sum, 1.0 / p).toFloat + // Normalize each element in the array + array.map(value => if (lpNorm != 0.0f) value / lpNorm else 0.0f) + } + } diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.scala new file mode 100644 index 00000000000000..8fb3bba9d3ba40 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.scala @@ -0,0 +1,482 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.ml.ai.BGE +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.tensorflow._ +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadTextAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} +import com.johnsnowlabs.nlp.serialization.MapFeature +import com.johnsnowlabs.storage.HasStorageRef +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +/** Sentence embeddings using BGE. + * + * BGE, or BAAI General Embeddings, a model that can map any text to a low-dimensional dense + * vector which can be used for tasks like retrieval, classification, clustering, or semantic search. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val embeddings = BGEEmbeddings.pretrained() + * .setInputCols("document") + * .setOutputCol("embeddings") + * }}} + * The default model is `"bge_base"`, if no name is provided. + * + * For available pretrained models please see the + * [[https://sparknlp.org/models?q=BGE Models Hub]]. + * + * For extended examples of usage, see + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddingsTestSpec.scala BGEEmbeddingsTestSpec]]. 
+ * + * '''Sources''' : + * + * [[https://arxiv.org/pdf/2309.07597 C-Pack: Packaged Resources To Advance General Chinese Embedding]] + * + * [[https://github.com/FlagOpen/FlagEmbedding BGE Github Repository]] + * + * ''' Paper abstract ''' + * + * ''We introduce C-Pack, a package of resources that significantly advance the field of general + * Chinese embeddings. C-Pack includes three critical resources. + * 1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 tasks and 35 datasets. + * 2) C-MTP is a massive text embedding dataset curated from labeled and unlabeled Chinese corpora + * for training embedding models. + * 3) C-TEM is a family of embedding models covering multiple sizes. + * Our models outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the + * time of the release. We also integrate and optimize the entire suite of training methods for + * C-TEM. Along with our resources on general Chinese embedding, we release our data and models for + * English text embeddings. The English models achieve state-of-the-art performance on the MTEB + * benchmark; meanwhile, our released English data is 2 times larger than the Chinese data. 
All + * these resources are made publicly available at https://github.com/FlagOpen/FlagEmbedding.'' + * + * ==Example== + * {{{ + * import spark.implicits._ + * import com.johnsnowlabs.nlp.base.DocumentAssembler + * import com.johnsnowlabs.nlp.annotators.Tokenizer + * import com.johnsnowlabs.nlp.embeddings.BGEEmbeddings + * import com.johnsnowlabs.nlp.EmbeddingsFinisher + * import org.apache.spark.ml.Pipeline + * + * val documentAssembler = new DocumentAssembler() + * .setInputCol("text") + * .setOutputCol("document") + * + * val embeddings = BGEEmbeddings.pretrained("bge_base", "en") + * .setInputCols("document") + * .setOutputCol("bge_embeddings") + * + * val embeddingsFinisher = new EmbeddingsFinisher() + * .setInputCols("bge_embeddings") + * .setOutputCols("finished_embeddings") + * .setOutputAsVector(true) + * + * val pipeline = new Pipeline().setStages(Array( + * documentAssembler, + * embeddings, + * embeddingsFinisher + * )) + * + * val data = Seq("query: how much protein should a female eat", + * "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day." + + * But, as you can see from this chart, you'll need to increase that if you're expecting or training for a" + + * marathon. Check out the chart below to see how much protein you should be eating each day." 
+ * + * ).toDF("text") + * val result = pipeline.fit(data).transform(data) + * + * result.selectExpr("explode(finished_embeddings) as result").show(1, 80) + * +--------------------------------------------------------------------------------+ + * | result| + * +--------------------------------------------------------------------------------+ + * |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...| + * [[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...| + * +--------------------------------------------------------------------------------+ + * }}} + * + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based embeddings + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class BGEEmbeddings(override val uid: String) + extends AnnotatorModel[BGEEmbeddings] + with HasBatchedAnnotate[BGEEmbeddings] + with WriteTensorflowModel + with WriteOnnxModel + with HasEmbeddingsProperties + with HasStorageRef + with HasCaseSensitiveProperties + with HasEngine { + + /** Annotator reference id. 
Used to identify elements in metadata or to refer to this annotator + * type + */ + override val inputAnnotatorTypes: Array[String] = + Array(AnnotatorType.DOCUMENT) + override val outputAnnotatorType: AnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS + + /** ConfigProto from tensorflow, serialized into byte array. Get with + * `config_proto.SerializeToString()` + * + * @group param + */ + val configProtoBytes = new IntArrayParam( + this, + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()") + + /** Max sentence length to process (Default: `512`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + def sentenceStartTokenId: Int = { + $$(vocabulary)("[CLS]") + } + + /** @group setParam */ + def sentenceEndTokenId: Int = { + $$(vocabulary)("[SEP]") + } + + /** Vocabulary used to encode the words to ids with WordPieceEncoder + * + * @group param + */ + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() + + /** @group setParam */ + def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) + + /** It contains TF model signatures for the loaded saved model + * + * @group param + */ + val signatures = + new MapFeature[String, String](model = this, name = "signatures").setProtected() + private var _model: Option[Broadcast[BGE]] = None + + def this() = this(Identifiable.randomUID("BGE_EMBEDDINGS")) + + /** @group setParam */ + def setConfigProtoBytes(bytes: Array[Int]): BGEEmbeddings.this.type = + set(this.configProtoBytes, bytes) + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + require( + value <= 512, + "BGE models do not support sequences longer than 512 because of trainable positional embeddings.") + require(value >= 1, "The maxSentenceLength must be at least 1") + set(maxSentenceLength, value) + this + } + + /** @group getParam 
*/ + def getMaxSentenceLength: Int = $(maxSentenceLength) + + /** @group setParam */ + def setSignatures(value: Map[String, String]): this.type = { + if (get(signatures).isEmpty) + set(signatures, value) + this + } + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper]): BGEEmbeddings = { + if (_model.isEmpty) { + _model = Some( + spark.sparkContext.broadcast( + new BGE( + tensorflowWrapper, + onnxWrapper, + configProtoBytes = getConfigProtoBytes, + sentenceStartTokenId = sentenceStartTokenId, + sentenceEndTokenId = sentenceEndTokenId, + signatures = getSignatures))) + } + + this + } + + /** Set Embeddings dimensions for the BERT model Only possible to set this when the first time + * is saved dimension is not changeable, it comes from BERT config file + * + * @group setParam + */ + override def setDimension(value: Int): this.type = { + if (get(dimension).isEmpty) + set(this.dimension, value) + this + } + + /** Whether to lowercase tokens or not + * + * @group setParam + */ + override def setCaseSensitive(value: Boolean): this.type = { + if (get(caseSensitive).isEmpty) + set(this.caseSensitive, value) + this + } + + setDefault(dimension -> 768, batchSize -> 8, maxSentenceLength -> 512, caseSensitive -> false) + + def tokenize(sentences: Seq[Annotation]): Seq[WordpieceTokenizedSentence] = { + val basicTokenizer = new BasicTokenizer($(caseSensitive)) + val encoder = new WordpieceEncoder($$(vocabulary)) + sentences.map { s => + val sent = Sentence( + content = s.result, + start = s.begin, + end = s.end, + metadata = Some(s.metadata), + index = s.begin) + val tokens = basicTokenizer.tokenize(sent) + val wordpieceTokens = tokens.flatMap(token => encoder.encode(token)) + WordpieceTokenizedSentence(wordpieceTokens) + } + } + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * 
Annotations that correspond to inputAnnotationCols generated by previous annotators if any + * @return + * any number of annotations processed for every input annotation. Not necessary one to one + * relationship + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + + val allAnnotations = batchedAnnotations + .filter(_.nonEmpty) + .zipWithIndex + .flatMap { case (annotations, i) => + annotations.filter(_.result.nonEmpty).map(x => (x, i)) + } + + // Tokenize sentences + val tokenizedSentences = tokenize(allAnnotations.map(_._1)) + val processedAnnotations = if (allAnnotations.nonEmpty) { + this.getModelIfNotSet.predict( + sentences = allAnnotations.map(_._1), + tokenizedSentences = tokenizedSentences, + batchSize = $(batchSize), + maxSentenceLength = $(maxSentenceLength)) + } else { + Seq() + } + + // Group resulting annotations by rows. If there are not sentences in a given row, return empty sequence + batchedAnnotations.indices.map(rowIndex => { + val rowAnnotations = processedAnnotations + // zip each annotation with its corresponding row index + .zip(allAnnotations) + // select the sentences belonging to the current row + .filter(_._2._2 == rowIndex) + // leave the annotation only + .map(_._1) + + if (rowAnnotations.nonEmpty) + rowAnnotations + else + Seq.empty[Annotation] + }) + + } + + /** @group getParam */ + def getModelIfNotSet: BGE = _model.get.value + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + val suffix = "_bge" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + BGEEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes, + savedSignatures = getSignatures) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + BGEEmbeddings.onnxFile) + + case _ => + throw new Exception(notSupportedEngineError) + } + } + + 
/** @group getParam */ + def getConfigProtoBytes: Option[Array[Byte]] = get(this.configProtoBytes).map(_.map(_.toByte)) + + /** @group getParam */ + def getSignatures: Option[Map[String, String]] = get(this.signatures) + + override protected def afterAnnotate(dataset: DataFrame): DataFrame = { + dataset.withColumn( + getOutputCol, + wrapSentenceEmbeddingsMetadata( + dataset.col(getOutputCol), + $(dimension), + Some($(storageRef)))) + } + +} + +trait ReadablePretrainedBGEModel + extends ParamsAndFeaturesReadable[BGEEmbeddings] + with HasPretrained[BGEEmbeddings] { + override val defaultModelName: Some[String] = Some("bge_base") + + /** Java compliant-overrides */ + override def pretrained(): BGEEmbeddings = super.pretrained() + + override def pretrained(name: String): BGEEmbeddings = super.pretrained(name) + + override def pretrained(name: String, lang: String): BGEEmbeddings = + super.pretrained(name, lang) + + override def pretrained(name: String, lang: String, remoteLoc: String): BGEEmbeddings = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadBGEDLModel extends ReadTensorflowModel with ReadOnnxModel { + this: ParamsAndFeaturesReadable[BGEEmbeddings] => + + override val tfFile: String = "bge_tensorflow" + override val onnxFile: String = "bge_onnx" + + def readModel(instance: BGEEmbeddings, path: String, spark: SparkSession): Unit = { + + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = readTensorflowModel(path, spark, "_bge_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None) + + case ONNX.name => + val onnxWrapper = + readOnnxModel(path, spark, "_bge_onnx", zipped = true, useBundle = false, None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper)) + + case _ => + throw new Exception(notSupportedEngineError) + } + + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): BGEEmbeddings = { + + val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) 
+ + val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap + + /*Universal parameters for all engines*/ + val annotatorModel = new BGEEmbeddings() + .setVocabulary(vocabs) + + annotatorModel.set(annotatorModel.engine, detectedEngine) + + detectedEngine match { + case TensorFlow.name => + val (wrapper, signatures) = + TensorflowWrapper.read( + localModelPath, + zipped = false, + useBundle = true, + tags = Array("serve"), + initAllTables = false) + + val _signatures = signatures match { + case Some(s) => s + case None => throw new Exception("Cannot load signature definitions from model!") + } + + /** the order of setSignatures is important if we use getSignatures inside + * setModelIfNotSet + */ + annotatorModel + .setSignatures(_signatures) + .setModelIfNotSet(spark, Some(wrapper), None) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper)) + + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } +} + +/** This is the companion object of [[BGEEmbeddings]]. Please refer to that class for the + * documentation. + */ +object BGEEmbeddings extends ReadablePretrainedBGEModel with ReadBGEDLModel { + private[BGEEmbeddings] val logger: Logger = + LoggerFactory.getLogger("BGEEmbeddings") +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddingsTestSpec.scala new file mode 100644 index 00000000000000..77e92795b8bacd --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/BGEEmbeddingsTestSpec.scala @@ -0,0 +1,116 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLModel +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.apache.spark.sql.functions.{col, size} +import org.scalatest.flatspec.AnyFlatSpec + +class BGEEmbeddingsTestSpec extends AnyFlatSpec { + + "BGE Embeddings" should "correctly embed multiple sentences" taggedAs SlowTest in { + + import ResourceHelper.spark.implicits._ + + val ddd = Seq( + "query: how much protein should a female eat", + "query: summit define", + "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 " + + "grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or" + + " training for a marathon. Check out the chart below to see how much protein you should be eating each day.", + "passage: Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of" + + " a mountain. : 2 the highest level. 
: 3 a meeting or series of meetings between the leaders of two or more" + + " governments.") + .toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val embeddings = BGEEmbeddings + .pretrained() + .setInputCols(Array("document")) + .setOutputCol("bge") + + val pipeline = new Pipeline().setStages(Array(document, embeddings)) + + val pipelineDF = pipeline.fit(ddd).transform(ddd) + pipelineDF.select("bge.embeddings").show(truncate = false) + + } + + it should "have embeddings of the same size" taggedAs SlowTest in { + import ResourceHelper.spark.implicits._ + val testDf = Seq( + "I like apples", + "I like bananas \\n and other things \\n like icream \\n and cats", + "I like rockets") + .toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val embeddings = BGEEmbeddings + .pretrained() + .setInputCols(Array("document")) + .setOutputCol("bge") + + val pipeline = new Pipeline().setStages(Array(document, embeddings)) + + val pipelineDF = pipeline.fit(testDf).transform(testDf) + + val embeddingsDF = pipelineDF.withColumn("embeddings", col("bge.embeddings").getItem(0)) + + val sizesArray: Array[Int] = embeddingsDF + .select(size(col("embeddings")).as("size")) + .collect() + .map(row => row.getAs[Int]("size")) + + assert(sizesArray.forall(_ == sizesArray.head)) + } + + it should "work with sentences" taggedAs SlowTest in { + import ResourceHelper.spark.implicits._ + val testData = "I really enjoy my job. 
This is amazing" + val testDf = Seq(testData).toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val sentenceDetectorDL = SentenceDetectorDLModel + .pretrained("sentence_detector_dl", "en") + .setInputCols(Array("document")) + .setOutputCol("sentences") + + val embeddings = BGEEmbeddings + .pretrained() + .setInputCols(Array("sentences")) + .setOutputCol("bge") + + val pipeline = new Pipeline().setStages(Array(document, sentenceDetectorDL, embeddings)) + + val pipelineDF = pipeline.fit(testDf).transform(testDf) + pipelineDF.select("bge.embeddings").show(false) + } + +} From 6a3623faf2205bee42db442fdbfbb5240f667629 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 27 Dec 2023 20:43:04 +0500 Subject: [PATCH 05/14] adding onnx support to DeberatForXXX annotators (#14096) --- .../ml/ai/DeBertaClassification.scala | 184 ++++++++++++------ .../dl/DeBertaForQuestionAnswering.scala | 80 +++++--- .../dl/DeBertaForSequenceClassification.scala | 73 +++++-- .../dl/DeBertaForTokenClassification.scala | 90 ++++++--- 4 files changed, 294 insertions(+), 133 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/DeBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/DeBertaClassification.scala index 5022105f47d588..32e2397e2c74e2 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/DeBertaClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/DeBertaClassification.scala @@ -16,12 +16,16 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor +import com.johnsnowlabs.ml.onnx.{OnnxSession, OnnxWrapper} import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import 
com.johnsnowlabs.nlp.{ActivationFunction, Annotation} -import org.tensorflow.ndarray.buffer.IntDataBuffer +import org.tensorflow.ndarray.buffer +import org.tensorflow.ndarray.buffer.{IntDataBuffer, LongDataBuffer} import scala.collection.JavaConverters._ @@ -37,7 +41,8 @@ import scala.collection.JavaConverters._ * TF v2 signatures in Spark NLP */ private[johnsnowlabs] class DeBertaClassification( - val tensorflowWrapper: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], val spp: SentencePieceWrapper, configProtoBytes: Option[Array[Byte]] = None, tags: Map[String, Int], @@ -48,6 +53,11 @@ private[johnsnowlabs] class DeBertaClassification( val _tfDeBertaSignatures: Map[String, String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name + private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions // keys representing the input and output tensors of the DeBERTa model protected val sentencePadTokenId: Int = spp.getSppModel.pieceToId("[PAD]") @@ -95,59 +105,13 @@ private[johnsnowlabs] class DeBertaClassification( } def tag(batch: Seq[Array[Int]]): Seq[Array[Array[Float]]] = { - val tensors = new TensorResources() - val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val batchLength = batch.length - val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - val segmentBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - - // [nb of encoded sentences , maxSentenceLength] - val shape = Array(batch.length.toLong, maxSentenceLength) - - batch.zipWithIndex - .foreach { case (sentence, idx) => - val offset = idx * maxSentenceLength - 
tokenBuffers.offset(offset).write(sentence) - maskBuffers - .offset(offset) - .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) - segmentBuffers.offset(offset).write(Array.fill(maxSentenceLength)(0)) - } - - val runner = tensorflowWrapper - .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) - .runner - - val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) - val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) - val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers) - - runner - .feed( - _tfDeBertaSignatures.getOrElse( - ModelSignatureConstants.InputIds.key, - "missing_input_id_key"), - tokenTensors) - .feed( - _tfDeBertaSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .feed( - _tfDeBertaSignatures - .getOrElse(ModelSignatureConstants.TokenTypeIds.key, "missing_segment_ids_key"), - segmentTensors) - .fetch(_tfDeBertaSignatures - .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) - - val outs = runner.run().asScala - val rawScores = TensorResources.extractFloats(outs.head) - - outs.foreach(_.close()) - tensors.clearSession(outs) - tensors.clearTensors() + val rawScores = detectedEngine match { + case ONNX.name => getRowScoresWithOnnx(batch) + case _ => getRawScoresWithTF(batch) + } val dim = rawScores.length / (batchLength * maxSentenceLength) val batchScores: Array[Array[Array[Float]]] = rawScores @@ -160,7 +124,7 @@ private[johnsnowlabs] class DeBertaClassification( batchScores } - def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] = { + private def getRawScoresWithTF(batch: Seq[Array[Int]]): Array[Float] = { val tensors = new TensorResources() val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max @@ -183,7 +147,7 @@ private[johnsnowlabs] class DeBertaClassification( 
segmentBuffers.offset(offset).write(Array.fill(maxSentenceLength)(0)) } - val runner = tensorflowWrapper + val runner = tensorflowWrapper.get .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) .runner @@ -215,6 +179,51 @@ private[johnsnowlabs] class DeBertaClassification( tensors.clearSession(outs) tensors.clearTensors() + rawScores + } + + + private def getRowScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { + + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + + embeddings + } finally if (results != null) results.close() + } + } + + def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] = { + + val batchLength = batch.length + + val rawScores = detectedEngine match { + case ONNX.name => getRowScoresWithOnnx(batch) + case _ => getRawScoresWithTF(batch) + } + val dim = rawScores.length / batchLength val batchScores: Array[Array[Float]] = rawScores @@ -237,6 +246,25 @@ private[johnsnowlabs] class DeBertaClassification( activation: String): Array[Array[Float]] = ??? 
def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { + val batchLength = batch.length + val (startLogits, endLogits) = detectedEngine match { + case ONNX.name => computeLogitsWithOnnx(batch) + case _ => computeLogitsWithTF(batch) + } + + val endDim = endLogits.length / batchLength + val endScores: Array[Array[Float]] = + endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray + + val startDim = startLogits.length / batchLength + val startScores: Array[Array[Float]] = + startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + + (startScores, endScores) + } + + + private def computeLogitsWithTF(batch: Seq[Array[Int]]): (Array[Float], Array[Float])={ val tensors = new TensorResources() val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max @@ -257,7 +285,7 @@ private[johnsnowlabs] class DeBertaClassification( .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) } - val runner = tensorflowWrapper + val runner = tensorflowWrapper.get .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) .runner @@ -286,15 +314,47 @@ private[johnsnowlabs] class DeBertaClassification( tensors.clearSession(outs) tensors.clearTensors() - val endDim = endLogits.length / batchLength - val endScores: Array[Array[Float]] = - endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray + (startLogits, endLogits) + } - val startDim = startLogits.length / batchLength - val startScores: Array[Array[Float]] = - startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray - (startScores, endScores) + private def computeLogitsWithOnnx(batch: Seq[Array[Int]]): (Array[Float], Array[Float]) = { + // [nb of encoded sentences] + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + 
OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + try { + val output = runner.run(inputs) + try { + val startLogits = output + .get("start_logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + val endLogits = output + .get("end_logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + tokenTensors.close() + maskTensors.close() + + (startLogits, endLogits) + } finally if (output != null) output.close() + } } def findIndexedToken( diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala index 600b85da999a6d..a9a12a21aa7506 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForQuestionAnswering.scala @@ -17,18 +17,11 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.{DeBertaClassification, MergeTokenStrategy} +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import 
com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -116,6 +109,7 @@ class DeBertaForQuestionAnswering(override val uid: String) extends AnnotatorModel[DeBertaForQuestionAnswering] with HasBatchedAnnotate[DeBertaForQuestionAnswering] with WriteTensorflowModel + with WriteOnnxModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine { @@ -196,13 +190,15 @@ class DeBertaForQuestionAnswering(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): DeBertaForQuestionAnswering = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new DeBertaClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = Map.empty[String, Int], @@ -253,13 +249,26 @@ class DeBertaForQuestionAnswering(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_deberta_classification", - DeBertaForQuestionAnswering.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_deberta_classification" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + DeBertaForQuestionAnswering.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + DeBertaForQuestionAnswering.onnxFile) + } + writeSentencePieceModel( path, spark, @@ -292,21 +301,38 @@ trait ReadablePretrainedDeBertaForQAModel trait ReadDeBertaForQuestionAnsweringDLModel extends ReadTensorflowModel + with ReadOnnxModel with ReadSentencePieceModel { this: ParamsAndFeaturesReadable[DeBertaForQuestionAnswering] 
=> override val tfFile: String = "deberta_classification_tensorflow" + override val onnxFile: String = "camembert_classification_onnx" override val sppFile: String = "deberta_spp" def readModel( instance: DeBertaForQuestionAnswering, path: String, spark: SparkSession): Unit = { - - val tf = - readTensorflowModel(path, spark, "_deberta_classification_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_deberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = + readTensorflowModel(path, spark, "_deberta_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "_deberta_classification_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -324,7 +350,7 @@ trait ReadDeBertaForQuestionAnsweringDLModel detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -337,7 +363,11 @@ trait ReadDeBertaForQuestionAnsweringDLModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala index 
0f025ebca7c367..328bc5447edeca 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForSequenceClassification.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.DeBertaClassification +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -29,7 +30,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -122,7 +123,7 @@ import org.apache.spark.sql.SparkSession */ class DeBertaForSequenceClassification(override val uid: String) extends AnnotatorModel[DeBertaForSequenceClassification] - with HasBatchedAnnotate[DeBertaForSequenceClassification] + with HasBatchedAnnotate[DeBertaForSequenceClassification] with WriteOnnxModel with WriteTensorflowModel with WriteSentencePieceModel with HasCaseSensitiveProperties @@ -238,13 +239,15 @@ class DeBertaForSequenceClassification(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): DeBertaForSequenceClassification = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new DeBertaClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -305,13 +308,26 @@ class DeBertaForSequenceClassification(override val uid: String) override def onWrite(path: String, spark: 
SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_deberta_classification", - DeBertaForSequenceClassification.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_deberta_classification" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + DeBertaForSequenceClassification.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + DeBertaForSequenceClassification.onnxFile) + } + writeSentencePieceModel( path, spark, @@ -342,21 +358,40 @@ trait ReadablePretrainedDeBertaForSequenceModel super.pretrained(name, lang, remoteLoc) } -trait ReadDeBertaForSequenceDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadDeBertaForSequenceDLModel + extends ReadTensorflowModel + with ReadOnnxModel + with ReadSentencePieceModel { this: ParamsAndFeaturesReadable[DeBertaForSequenceClassification] => override val tfFile: String = "deberta_classification_tensorflow" + override val onnxFile: String = "deberta_classification_onnx" override val sppFile: String = "deberta_spp" def readModel( instance: DeBertaForSequenceClassification, path: String, spark: SparkSession): Unit = { - - val tf = - readTensorflowModel(path, spark, "_deberta_classification_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_deberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = + readTensorflowModel(path, spark, "_deberta_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "_deberta_classification_onnx", + zipped = true, + useBundle = false, + None) + 
instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -375,7 +410,7 @@ trait ReadDeBertaForSequenceDLModel extends ReadTensorflowModel with ReadSentenc detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -388,8 +423,12 @@ trait ReadDeBertaForSequenceDLModel extends ReadTensorflowModel with ReadSentenc */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala index 81b3fdff7def4b..43f10690e104ea 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/DeBertaForTokenClassification.scala @@ -17,19 +17,11 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.DeBertaClassification +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - loadTextAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.TensorFlow +import 
com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, loadTextAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -124,6 +116,7 @@ class DeBertaForTokenClassification(override val uid: String) extends AnnotatorModel[DeBertaForTokenClassification] with HasBatchedAnnotate[DeBertaForTokenClassification] with WriteTensorflowModel + with WriteOnnxModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine { @@ -218,13 +211,15 @@ class DeBertaForTokenClassification(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): DeBertaForTokenClassification = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new DeBertaClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -277,13 +272,26 @@ class DeBertaForTokenClassification(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_deberta_classification", - DeBertaForTokenClassification.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_deberta_classification" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + DeBertaForTokenClassification.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + 
getModelIfNotSet.onnxWrapper.get, + suffix, + DeBertaForTokenClassification.onnxFile) + } + writeSentencePieceModel( path, spark, @@ -313,20 +321,40 @@ trait ReadablePretrainedDeBertaForTokenModel remoteLoc: String): DeBertaForTokenClassification = super.pretrained(name, lang, remoteLoc) } -trait ReadDeBertaForTokenDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadDeBertaForTokenDLModel + extends ReadTensorflowModel + with ReadOnnxModel + with ReadSentencePieceModel { this: ParamsAndFeaturesReadable[DeBertaForTokenClassification] => override val tfFile: String = "deberta_classification_tensorflow" + override val onnxFile: String = "deberta_classification_onnx" override val sppFile: String = "deberta_spp" def readModel( - instance: DeBertaForTokenClassification, - path: String, - spark: SparkSession): Unit = { - - val tf = readTensorflowModel(path, spark, "_deberta_classification_tf", initAllTables = false) + instance: DeBertaForTokenClassification, + path: String, + spark: SparkSession): Unit = { val spp = readSentencePieceModel(path, spark, "_deberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = + readTensorflowModel(path, spark, "_deberta_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "_deberta_classification_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -344,7 +372,7 @@ trait ReadDeBertaForTokenDLModel extends ReadTensorflowModel with ReadSentencePi detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures 
match { @@ -357,7 +385,11 @@ trait ReadDeBertaForTokenDLModel extends ReadTensorflowModel with ReadSentencePi */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) From 1d1d0bf823e1366bd648d3db8e8c22aee00ce282 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Wed, 27 Dec 2023 10:45:37 -0500 Subject: [PATCH 06/14] [SPARKNLP-957] Solves average pooling computation when processing batches (#14104) Co-authored-by: Maziyar Panahi --- .../scala/com/johnsnowlabs/ml/ai/E5.scala | 10 +- .../scala/com/johnsnowlabs/ml/ai/MPNet.scala | 10 +- .../com/johnsnowlabs/ml/util/LinAlg.scala | 208 +++++++++++++++--- .../com/johnsnowlabs/ml/util/LinAlgTest.scala | 81 ++++++- .../nlp/embeddings/E5EmbeddingsTestSpec.scala | 31 +++ .../embeddings/MPNetEmbeddingsTestSpec.scala | 31 +++ 6 files changed, 325 insertions(+), 46 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/E5.scala b/src/main/scala/com/johnsnowlabs/ml/ai/E5.scala index 4104948e2c2644..adf56a0cc6969a 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/E5.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/E5.scala @@ -183,7 +183,7 @@ private[johnsnowlabs] class E5( val info = lastHiddenState.getInfo.asInstanceOf[TensorInfo] val shape = info.getShape try { - val embeddings = lastHiddenState + val flattenEmbeddings = lastHiddenState .asInstanceOf[OnnxTensor] .getFloatBuffer .array() @@ -191,11 +191,9 @@ private[johnsnowlabs] class E5( maskTensors.close() segmentTensors.close() - val dim = shape.last.toInt - val avgPooling = LinAlg.avgPooling(embeddings, attentionMask(0), dim) - val normalizedSentenceEmbeddings = 
LinAlg.normalizeArray(avgPooling) - - Array(normalizedSentenceEmbeddings) + val embeddings = LinAlg.avgPooling(flattenEmbeddings, attentionMask, shape) + val normalizedEmbeddings = LinAlg.l2Normalize(embeddings) + LinAlg.denseMatrixToArray(normalizedEmbeddings) } finally if (results != null) results.close() } } diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/MPNet.scala b/src/main/scala/com/johnsnowlabs/ml/ai/MPNet.scala index 3d48e622f908f2..989e8d083452eb 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/MPNet.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/MPNet.scala @@ -182,18 +182,16 @@ private[johnsnowlabs] class MPNet( val info = lastHiddenState.getInfo.asInstanceOf[TensorInfo] val shape = info.getShape try { - val embeddings = lastHiddenState + val flattenEmbeddings = lastHiddenState .asInstanceOf[OnnxTensor] .getFloatBuffer .array() tokenTensors.close() maskTensors.close() - val dim = shape.last.toInt - val avgPooling = LinAlg.avgPooling(embeddings, attentionMask(0), dim) - val normalizedSentenceEmbeddings = LinAlg.normalizeArray(avgPooling) - - Array(normalizedSentenceEmbeddings) + val embeddings = LinAlg.avgPooling(flattenEmbeddings, attentionMask, shape) + val normalizedEmbeddings = LinAlg.l2Normalize(embeddings) + LinAlg.denseMatrixToArray(normalizedEmbeddings) } finally if (results != null) results.close() } } diff --git a/src/main/scala/com/johnsnowlabs/ml/util/LinAlg.scala b/src/main/scala/com/johnsnowlabs/ml/util/LinAlg.scala index cf23c78a83427a..fc7bf4e503dbcf 100644 --- a/src/main/scala/com/johnsnowlabs/ml/util/LinAlg.scala +++ b/src/main/scala/com/johnsnowlabs/ml/util/LinAlg.scala @@ -1,8 +1,9 @@ package com.johnsnowlabs.ml.util -import breeze.linalg.{DenseMatrix, tile} +import breeze.linalg.{DenseMatrix, norm, sum, tile, *} import scala.math.{sqrt, pow} + object LinAlg { object implicits { @@ -83,51 +84,192 @@ object LinAlg { score }._2 - def avgPooling(embeddings: Array[Float], attentionMask: Array[Long], dim: Int): Array[Float] = { - 
val expandedAttentionMask = new Array[Float](embeddings.length) - // Expand attentionMask to match the length of embeddings - var j = 0 - for (i <- embeddings.indices) { - expandedAttentionMask(i) = attentionMask(j) - j += 1 - if (j == attentionMask.length) { - j = 0 // reset j when we reach the end of attentionMask + + /** + * Performs average pooling on embeddings using an attention mask. + * + * This method takes flattened embeddings, an attention mask, and the shape of the embeddings, + * and computes the average pooling. The pooling is done by grouping the embeddings based on the + * attention mask and computing the weighted sum of these groups. The result is normalized + * by the total weight of the attention mask. + * + * @param flattenEmbeddings + * Array of flattened embeddings + * @param attentionMask + * 2D Array representing the attention mask + * @param shape + * Array representing the shape of the embeddings (dimensions) + * @return + * A DenseMatrix of floats representing the average pooled embeddings + */ + def avgPooling( + flattenEmbeddings: Array[Float], + attentionMask: Array[Array[Long]], + shape: Array[Long]): DenseMatrix[Float] = { + + val thirdDim = shape.last.toInt + val secondDim = shape(1).toInt + val embeddings = flattenEmbeddings.grouped(thirdDim).grouped(secondDim).toArray + + val embeddingsMatrix = embeddings.map(embedding => DenseMatrix(embedding: _*)) + val attentionMaskMatrix = DenseMatrix(attentionMask: _*) + val expandedAttentionMask = expandAttentionMask(embeddingsMatrix, attentionMaskMatrix) + val weightedSum = computeWeightSum(embeddingsMatrix, expandedAttentionMask) + val totalWeight = computeTotalWeight(expandedAttentionMask) + weightedSum /:/ totalWeight + } + + /** + * Expands the attention mask to match the dimensions of the token embeddings. + * + * This method is responsible for aligning the attention mask with the embeddings. 
It transposes the + * attention mask and then replicates its values to match the dimensionality of the token embeddings. + * The expansion is done for each slice of the embeddings, ensuring that the expanded mask has the + * same number of rows as the token embeddings and the same number of columns as the embedding dimension. + * + * @param embeddings + * Array of DenseMatrix[Float] representing the token embeddings + * @param attentionMask + * DenseMatrix[Long] representing the initial attention mask + * @return + * Array of DenseMatrix[Float] where each matrix is the expanded attention mask aligned with the + * corresponding token embeddings + */ + + private def expandAttentionMask( + embeddings: Array[DenseMatrix[Float]], + attentionMask: DenseMatrix[Long]): Array[DenseMatrix[Float]] = { + + val transposedMask = attentionMask.t + val expectedEmbeddingSize = transposedMask.rows + embeddings.map { embedding => + require(embedding.rows == expectedEmbeddingSize, + s"Embedding dimension mismatch: expected $expectedEmbeddingSize, but found ${embedding.rows}") + + val embeddingSize = embedding.cols + val expandedMask = DenseMatrix.zeros[Float](transposedMask.rows, embeddingSize) + for (i <- 0 until transposedMask.rows; j <- 0 until embeddingSize) { + expandedMask(i, j) = + transposedMask(i, 0) // Replicate the mask value across the embedding dimension } + + expandedMask } + } - val sentenceEmbeddingsMatrix = embeddings.grouped(dim).toArray - val attentionMaskMatrix = expandedAttentionMask.grouped(dim).toArray + /** + * Computes the weighted sum of embeddings based on an expanded input mask. + * + * This method applies a weight to each embedding using the corresponding expanded input mask. + * The weights are applied through element-wise multiplication of each embedding with its + * respective mask. After weighting, the method sums the embeddings across the sequence length + * dimension. 
The result is a DenseMatrix representing the weighted sum of the embeddings for + * each item in the batch. + * + * @param embeddings + * Array of DenseMatrix[Float] representing the embeddings for each item in the batch + * @param inputMaskExpanded + * Array of DenseMatrix[Float] representing the expanded input masks, aligned with the embeddings + * @return + * DenseMatrix[Float] where each row corresponds to the weighted sum of embeddings for an item in the batch + */ + private def computeWeightSum( + embeddings: Array[DenseMatrix[Float]], + inputMaskExpanded: Array[DenseMatrix[Float]]): DenseMatrix[Float] = { + val batchSize = embeddings.length + val embeddingDim = if (batchSize > 0) embeddings.head.cols else 0 + val resultMatrix = DenseMatrix.zeros[Float](batchSize, embeddingDim) - val elementWiseProduct = - computeElementWiseProduct(sentenceEmbeddingsMatrix, attentionMaskMatrix) - val weightedSum: Array[Float] = elementWiseProduct.transpose.map(_.sum) + for (i <- embeddings.indices) { + val weighted = embeddings(i) *:* inputMaskExpanded(i) + resultMatrix(i, ::) := sum(weighted(::, *)) + } - val sumAlongDimension2: Array[Float] = attentionMaskMatrix.transpose.map(_.sum) - // Clamp each element to a minimum value of 1e-9 - val totalWeight: Array[Float] = sumAlongDimension2.map(x => math.max(x, 1e-9.toFloat)) - computeElementWiseDivision(weightedSum, totalWeight) + resultMatrix } - private def computeElementWiseProduct( - arrayA: Array[Array[Float]], - arrayB: Array[Array[Float]]): Array[Array[Float]] = { - arrayA.zip(arrayB).map { case (row1, row2) => - row1.zip(row2).map { case (a, b) => a * b } + /** + * Computes the total weight for each embedding in the batch based on the expanded input mask. + * + * This method calculates the sum of weights for each embedding slice across the sequence length + * dimension using the expanded input mask. The result is a DenseMatrix representing the total weight + * for each embedding in the batch. 
To ensure numerical stability, a clamp operation is applied to + * each sum to prevent values from falling below a minimum threshold. + * + * @param inputMaskExpanded + * Array of DenseMatrix[Float] representing the expanded input masks for each item in the batch + * @param minValue + * Float representing the minimum value to clamp the weights to, defaulting to 1e-9f + * @return + * DenseMatrix[Float] where each row corresponds to the total weight of embeddings for an item in the batch + */ + private def computeTotalWeight( + inputMaskExpanded: Array[DenseMatrix[Float]], + minValue: Float = 1e-9f): DenseMatrix[Float] = { + val batchSize = inputMaskExpanded.length + val embeddingDim = if (batchSize > 0) inputMaskExpanded.head.cols else 0 + val totalWeight = DenseMatrix.zeros[Float](batchSize, embeddingDim) + + for (i <- inputMaskExpanded.indices) { + totalWeight(i, ::) := sum(inputMaskExpanded(i)(::, *)) } + + // Applying clamp operation + totalWeight.mapValues(x => math.max(x, minValue)) } - private def computeElementWiseDivision( - arrayA: Array[Float], - arrayB: Array[Float]): Array[Float] = { - arrayA.zip(arrayB).map { case (a, b) => - if (b != 0.0f) a / b else 0.0f // Avoid division by zero + /** + * Normalizes each row of a DenseMatrix using the L2 norm. + * + * This method applies L2 normalization to the embeddings. It first computes the L2 norm for each row + * (embedding) in the input matrix. Then, it creates a matrix where each row is the computed norms vector, + * ensuring the dimensions match the input embeddings. Finally, it normalizes each row in the embeddings + * by dividing by the corresponding L2 norm. + * + * The result is a DenseMatrix where each row (embedding) is L2 normalized, ensuring that embeddings have + * a consistent scale for further processing. 
+ * + * @param embeddings + * DenseMatrix[Float] representing the embeddings to be normalized + * @return + * DenseMatrix[Float] where each row is an L2 normalized version of the corresponding row in the input matrix + */ + def l2Normalize(embeddings: DenseMatrix[Float]): DenseMatrix[Float] = { + val norms = norm(embeddings(*, ::), 2) + + // Normalize each row, avoiding division by zero + val normalized = DenseMatrix.tabulate[Float](embeddings.rows, embeddings.cols) { (i, j) => + if (norms(i) != 0) embeddings(i, j) / norms(i).toFloat else 0.0f } + + normalized } - def normalizeArray(array: Array[Float]): Array[Float] = { - val l2Norm: Float = sqrt(array.map(x => x * x).sum).toFloat - // Normalize each element in the array - array.map(value => if (l2Norm != 0.0f) value / l2Norm else 0.0f) + /** + * Converts a DenseMatrix to a 2D array of floats. + * + * This method is used to transform a DenseMatrix[Float] into a two-dimensional array. + * It iterates over the rows and columns of the DenseMatrix, copying each element into the corresponding + * position in the newly created 2D array. 
+ * + * @param matrix + * DenseMatrix[Float] that needs to be converted to a 2D array + * @return + * An 2D array representing the same data as the input DenseMatrix + */ + def denseMatrixToArray(matrix: DenseMatrix[Float]): Array[Array[Float]] = { + val rows = matrix.rows + val cols = matrix.cols + + val array = Array.ofDim[Float](rows, cols) + + for (i <- 0 until rows) { + for (j <- 0 until cols) { + array(i)(j) = matrix(i, j) + } + } + + array } def lpNormalizeArray(array: Array[Float], p: Int = 2): Array[Float] = { diff --git a/src/test/scala/com/johnsnowlabs/ml/util/LinAlgTest.scala b/src/test/scala/com/johnsnowlabs/ml/util/LinAlgTest.scala index fbdf335e4418f3..8ad7bd0079da54 100644 --- a/src/test/scala/com/johnsnowlabs/ml/util/LinAlgTest.scala +++ b/src/test/scala/com/johnsnowlabs/ml/util/LinAlgTest.scala @@ -2,8 +2,11 @@ package com.johnsnowlabs.ml.util import breeze.linalg._ import com.johnsnowlabs.ml.util.LinAlg.implicits.ExtendedDenseMatrix +import com.johnsnowlabs.ml.util.LinAlg.{denseMatrixToArray, l2Normalize} import org.scalatest.flatspec.AnyFlatSpec -class LinAlgTest extends AnyFlatSpec { +import org.scalatest.matchers.should.Matchers + +class LinAlgTest extends AnyFlatSpec with Matchers { behavior of "LinAlgTest" @@ -54,4 +57,80 @@ class LinAlgTest extends AnyFlatSpec { val expected = DenseMatrix(Seq(2.0d, 3.0d), Seq(2.0d, 3.0d), Seq(2.0d, 3.0d), Seq(2.0d, 3.0d)) assert(ab == expected) } + + val tolerance = 1e-6f + + def assertEqualWithTolerance(actual: Array[Float], expected: Array[Float]): Unit = { + assert(actual.length == expected.length, "Array lengths differ") + for ((a, e) <- actual.zip(expected)) { + assert(math.abs(a - e) <= tolerance, s"Expected $e, got $a within tolerance $tolerance") + } + } + + "l2Normalize" should "correctly normalize a regular matrix" in { + val matrix = DenseMatrix((1.0f, 2.0f), (3.0f, 4.0f)) + val normalized = l2Normalize(matrix) + assertEqualWithTolerance(normalized(*, ::).map(norm(_, 2)).toArray.map(_.toFloat), 
Array(1.0f, 1.0f)) + } + + it should "handle a single row matrix" in { + val matrix = DenseMatrix((1.0f, 2.0f, 3.0f)) + val normalized = l2Normalize(matrix) + assert(math.abs(norm(normalized.toDenseVector, 2) - 1.0f) <= tolerance) + } + + it should "handle a single column matrix" in { + val matrix = DenseMatrix(1.0f, 2.0f, 3.0f) + val normalized = l2Normalize(matrix) + assertEqualWithTolerance(normalized(*, ::).map(norm(_, 2)).toArray.map(_.toFloat), Array(1.0f, 1.0f, 1.0f)) + } + + it should "handle a matrix with zero elements" in { + val matrix = DenseMatrix((0.0f, 0.0f), (0.0f, 0.0f)) + val normalized = l2Normalize(matrix) + assert(normalized === matrix) + } + + it should "normalize each row to unit length" in { + val matrix = DenseMatrix((1.0f, 0.0f), (0.0f, 1.0f)) + val normalized = l2Normalize(matrix) + assertEqualWithTolerance(normalized(*, ::).map(norm(_, 2)).toArray.map(_.toFloat), Array(1.0f, 1.0f)) + } + + it should "correctly normalize a matrix with negative values" in { + val matrix = DenseMatrix((-1.0f, -2.0f), (3.0f, -4.0f)) + val normalized = l2Normalize(matrix) + assertEqualWithTolerance(normalized(*, ::).map(norm(_, 2)).toArray.map(_.toFloat), Array(1.0f, 1.0f)) + } + + "denseMatrixToArray" should "correctly convert a regular matrix" in { + val matrix = DenseMatrix((1.0f, 2.0f), (3.0f, 4.0f)) + val array = denseMatrixToArray(matrix) + assert(array === Array(Array(1.0f, 2.0f), Array(3.0f, 4.0f))) + } + + it should "handle a single row matrix" in { + val matrix = DenseMatrix.create(1, 3, Array(1.0f, 2.0f, 3.0f)) + val array = denseMatrixToArray(matrix) + assert(array === Array(Array(1.0f, 2.0f, 3.0f))) + } + + it should "handle a single column matrix" in { + val matrix = DenseMatrix.create(3, 1, Array(1.0f, 2.0f, 3.0f)) + val array = denseMatrixToArray(matrix) + assert(array === Array(Array(1.0f), Array(2.0f), Array(3.0f))) + } + + it should "handle an empty matrix" in { + val matrix = DenseMatrix.zeros[Float](0, 0) + val array = 
denseMatrixToArray(matrix) + assert(array === Array[Array[Float]]()) + } + + it should "correctly convert a matrix with various values" in { + val matrix = DenseMatrix((-1.0f, 0.0f), (3.0f, -4.0f)) + val array = denseMatrixToArray(matrix) + assert(array === Array(Array(-1.0f, 0.0f), Array(3.0f, -4.0f))) + } + } diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala index 286bcd933c3e0b..7c5b2ed5d5e656 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala @@ -113,4 +113,35 @@ class E5EmbeddingsTestSpec extends AnyFlatSpec { pipelineDF.select("e5.embeddings").show(false) } + it should "not return empty embeddings" taggedAs SlowTest in { + import ResourceHelper.spark.implicits._ + val interests = Seq("I like music", "I like movies", "I like books", "I like sports", + "I like travel", "I like food", "I like games", "I like art", + "I like nature", "I like science", "I like technology", "I like history", + "I like fashion", "I like cars", "I like animals", "I like gardening") + val testDf = interests.toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val embeddings = E5Embeddings + .pretrained() + .setInputCols(Array("document")) + .setOutputCol("e5") + + val pipeline = new Pipeline().setStages(Array(document, embeddings)) + + val pipelineDF = pipeline.fit(testDf).transform(testDf) + + val embeddingsDF = pipelineDF.withColumn("embeddings", col("e5.embeddings").getItem(0)) + + val sizesArray: Array[Int] = embeddingsDF + .select(size(col("embeddings")).as("size")) + .collect() + .map(row => row.getAs[Int]("size")) + + assert(sizesArray.forall(_ > 0)) + } + } diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddingsTestSpec.scala 
b/src/test/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddingsTestSpec.scala index c75a9f33919e05..70c5cb1bbec893 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddingsTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddingsTestSpec.scala @@ -107,4 +107,35 @@ class MPNetEmbeddingsTestSpec extends AnyFlatSpec { pipelineDF.select("mpnet.embeddings").show(false) } + it should "not return empty embeddings" taggedAs SlowTest in { + import ResourceHelper.spark.implicits._ + val interests = Seq("I like music", "I like movies", "I like books", "I like sports", + "I like travel", "I like food", "I like games", "I like art", + "I like nature", "I like science", "I like technology", "I like history", + "I like fashion", "I like cars", "I like animals", "I like gardening") + val testDf = interests.toDF("text") + + val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + val embeddings = MPNetEmbeddings + .pretrained() + .setInputCols("document") + .setOutputCol("mpnet") + + val pipeline = new Pipeline().setStages(Array(document, embeddings)) + + val pipelineModel = pipeline.fit(testDf) + val pipelineDF = pipelineModel.transform(testDf) + + val embeddingsDF = pipelineDF.withColumn("embeddings", col("mpnet.embeddings").getItem(0)) + val sizesArray: Array[Int] = embeddingsDF + .select(size(col("embeddings")).as("size")) + .collect() + .map(row => row.getAs[Int]("size")) + + assert(sizesArray.forall(_ > 0)) + } + } From 679a2aa32205e0ecd5885a10dcc0720c4da31bda Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Wed, 27 Dec 2023 10:46:49 -0500 Subject: [PATCH 07/14] [SPARKNLP-949] Adding changes for spark 3.5 compatibility (#14105) * [SPARKNLP-949] Adding changes for spark 3.5 compatibility * [SPARKNLP-949] Adding stage for spark 3.5 in CI pipeline * [SPARKNLP-949] Updating python version for spark 3.5 * [SPARKNLP-949] Adding changes for spark 3.5.x 
backward compatibility --- .github/workflows/build_and_test.yml | 40 ++++++++++++++++ project/Dependencies.scala | 15 ++++-- .../com/johnsnowlabs/nlp/AnnotatorModel.scala | 17 +++---- .../johnsnowlabs/nlp/HasBatchedAnnotate.scala | 33 ++++++++----- .../annotators/common/DatasetHelpers.scala | 4 +- .../nlp/util/SparkNlpConfig.scala | 48 +++++++++++++++++++ .../nlp/util/SparkNlpConfigKeys.scala | 24 ---------- .../nlp/annotators/ner/dl/NerDLSpec.scala | 4 +- .../symmetric/SymmetricDeleteBehaviors.scala | 6 +-- 9 files changed, 134 insertions(+), 57 deletions(-) create mode 100644 src/main/scala/com/johnsnowlabs/nlp/util/SparkNlpConfig.scala delete mode 100644 src/main/scala/com/johnsnowlabs/nlp/util/SparkNlpConfigKeys.scala diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a1afb5b4117065..23d84e1f82150d 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -31,6 +31,46 @@ on: - 'main' jobs: + spark35: + if: "! 
contains(toJSON(github.event.commits.*.message), '[skip test]')" + runs-on: macos-latest + env: + TF_CPP_MIN_LOG_LEVEL: 3 + JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" + name: Build and Test on Apache Spark 3.5.x + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-java@v3 + with: + distribution: 'adopt' + java-version: '8' + cache: 'sbt' + - name: Install Python 3.10 + uses: actions/setup-python@v2 + with: + python-version: 3.10.12 + architecture: x64 + - name: Install Python packages (Python 3.10) + run: | + python -m pip install --upgrade pip + pip install pyspark==3.5.0 numpy pytest + - name: Build Spark NLP on Apache Spark 3.5.0 + run: | + brew install sbt + sbt -mem 4096 -Dis_spark35=true clean assemblyAndCopy + - name: Test Spark NLP in Scala - Apache Spark 3.5.x + run: | + sbt -mem 4096 coverage test + - name: Upload coverage data to Coveralls + run: sbt coverageReport coveralls + env: + COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COVERALLS_FLAG_NAME: Apache Spark 3.5.x - Scala 2.12 + - name: Test Spark NLP in Python - Apache Spark 3.5.x + run: | + cd python + python3.10 -m pytest -v -m fast spark34: if: "! 
contains(toJSON(github.event.commits.*.message), '[skip test]')" runs-on: macos-latest diff --git a/project/Dependencies.scala b/project/Dependencies.scala index b7d10c01b856e6..95112a2bb338b6 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -4,12 +4,13 @@ object Dependencies { /** ------- Spark version start ------- */ /* default spark version to base the APIS */ - val spark34Ver = "3.4.0" + val spark35Ver = "3.5.0" /* only used in unit tests */ val spark30Ver = "3.0.3" val spark31Ver = "3.1.3" val spark32Ver = "3.2.3" val spark33Ver = "3.3.1" + val spark34Ver = "3.4.0" /* required for different hardware */ val is_gpu: String = System.getProperty("is_gpu", "false") @@ -22,9 +23,10 @@ object Dependencies { val is_spark31: String = System.getProperty("is_spark31", "false") val is_spark32: String = System.getProperty("is_spark32", "false") val is_spark33: String = System.getProperty("is_spark33", "false") - val is_spark34: String = System.getProperty("is_spark34", "true") + val is_spark34: String = System.getProperty("is_spark34", "false") + val is_spark35: String = System.getProperty("is_spark35", "true") - val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34) + val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34, is_spark35) /** ------- Spark version end ------- */ @@ -46,7 +48,8 @@ object Dependencies { is_spark31: String, is_spark32: String, is_spark33: String, - is_spark34: String): String = { + is_spark34: String, + is_spark35: String): String = { if (is_spark30.equals("true")) { spark30Ver } else if (is_spark31.equals("true")) { @@ -55,9 +58,11 @@ object Dependencies { spark32Ver } else if (is_spark33.equals("true")) { spark33Ver + } else if (is_spark34.equals("true")) { + spark34Ver } else { /* default spark version */ - spark34Ver + spark35Ver } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala 
b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala index 3698ad0167a94f..1a350c750fc958 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotatorModel.scala @@ -16,8 +16,9 @@ package com.johnsnowlabs.nlp +import com.johnsnowlabs.nlp.util.SparkNlpConfig import org.apache.spark.ml.{Model, PipelineModel} -import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset, Row} @@ -48,7 +49,7 @@ abstract class AnnotatorModel[M <: Model[M]] extends RawAnnotator[M] with CanBeL s"${inputAnnotatorTypes.mkString(", ")}") val inputDataset = beforeAnnotate(dataset) - + val newStructType = inputDataset.schema.add(getOutputCol, Annotation.arrayType) val processedDataset = { this match { case withAnnotate: HasSimpleAnnotate[M] => @@ -64,8 +65,8 @@ abstract class AnnotatorModel[M <: Model[M]] extends RawAnnotator[M] with CanBeL } })) case withBatchAnnotate: HasBatchedAnnotate[M] => - val newStructType = inputDataset.schema.add(getOutputCol, Annotation.arrayType) - implicit val encoder: ExpressionEncoder[Row] = RowEncoder(newStructType) + implicit val encoder: ExpressionEncoder[Row] = + SparkNlpConfig.getEncoder(inputDataset, newStructType) val processedDataFrame = inputDataset.mapPartitions(partition => { withBatchAnnotate.batchProcess(partition) }) @@ -80,8 +81,8 @@ abstract class AnnotatorModel[M <: Model[M]] extends RawAnnotator[M] with CanBeL dfWithMetadata case withBatchAnnotateImage: HasBatchedAnnotateImage[M] => - val newStructType = inputDataset.schema.add(getOutputCol, Annotation.arrayType) - implicit val encoder: ExpressionEncoder[Row] = RowEncoder(newStructType) + implicit val encoder: ExpressionEncoder[Row] = + SparkNlpConfig.getEncoder(inputDataset, newStructType) val processedDataFrame = inputDataset.mapPartitions(partition => { 
withBatchAnnotateImage.batchProcess(partition) }) @@ -96,8 +97,8 @@ abstract class AnnotatorModel[M <: Model[M]] extends RawAnnotator[M] with CanBeL dfWithMetadata case withBatchAnnotateAudio: HasBatchedAnnotateAudio[M] => - val newStructType = inputDataset.schema.add(getOutputCol, Annotation.arrayType) - implicit val encoder: ExpressionEncoder[Row] = RowEncoder(newStructType) + implicit val encoder: ExpressionEncoder[Row] = + SparkNlpConfig.getEncoder(inputDataset, newStructType) val processedDataFrame = inputDataset.mapPartitions(partition => { withBatchAnnotateAudio.batchProcess(partition) }) diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotate.scala b/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotate.scala index 8c41f84f3caacb..67f5d39d984f0d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotate.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotate.scala @@ -47,19 +47,26 @@ trait HasBatchedAnnotate[M <: Model[M]] { def getBatchSize: Int = $(batchSize) def batchProcess(rows: Iterator[_]): Iterator[Row] = { - // TODO remove the @unchecked annotation and create a type to handle different subtypes - rows - .grouped(getBatchSize) - .flatMap { case batchedRows: Seq[Row @unchecked] => - val inputAnnotations = batchedRows.map(row => { - getInputCols.flatMap(inputCol => { - row.getAs[Seq[Row]](inputCol).map(Annotation(_)) - }) - }) - val outputAnnotations = batchAnnotate(inputAnnotations) - batchedRows.zip(outputAnnotations).map { case (row, annotations) => - row.toSeq ++ Array(annotations.map(a => Row(a.productIterator.toSeq: _*))) - } + val groupedRows = rows.grouped(getBatchSize) + + groupedRows.flatMap { + case batchRow: Seq[Row] => processBatchRows(batchRow) + case singleRow: Row => processBatchRows(Seq(singleRow)) + case _ => Seq(Row.empty) + } + } + + private def processBatchRows(batchedRows: Seq[Row]): Seq[Row] = { + val inputAnnotations = batchedRows.map(row => { + getInputCols.flatMap(inputCol => { + 
row.getAs[Seq[Row]](inputCol).map(Annotation(_)) + }) + }) + val outputAnnotations = batchAnnotate(inputAnnotations) + batchedRows + .zip(outputAnnotations) + .map { case (row, annotations) => + row.toSeq ++ Array(annotations.map(a => Row(a.productIterator.toSeq: _*))) } .map(Row.fromSeq) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DatasetHelpers.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DatasetHelpers.scala index dfca5b0aeea9ed..842b3c5251a754 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DatasetHelpers.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/common/DatasetHelpers.scala @@ -18,8 +18,8 @@ package com.johnsnowlabs.nlp.annotators.common import com.johnsnowlabs.ml.crf.TextSentenceLabels import com.johnsnowlabs.ml.tensorflow.SentenceGrouper +import com.johnsnowlabs.nlp.util.SparkNlpConfig import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.catalyst.encoders.RowEncoder import scala.reflect.ClassTag @@ -27,7 +27,7 @@ object DatasetHelpers { implicit class DataFrameHelper(dataset: DataFrame) { def randomize: DataFrame = { - implicit val encoder = RowEncoder(dataset.schema) + implicit val encoder = SparkNlpConfig.getEncoder(dataset, dataset.schema) dataset.mapPartitions { new scala.util.Random().shuffle(_).toIterator } diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/SparkNlpConfig.scala b/src/main/scala/com/johnsnowlabs/nlp/util/SparkNlpConfig.scala new file mode 100644 index 00000000000000..dc7cc29209166d --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/util/SparkNlpConfig.scala @@ -0,0 +1,48 @@ +/* + * Copyright 2017-2022 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.util + +import com.johnsnowlabs.util.Version +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{Dataset, Row} + +/** Additional configure options that used by spark.nlp */ +object SparkNlpConfig { + + def getEncoder(inputDataset: Dataset[_], newStructType: StructType): ExpressionEncoder[Row] = { + val sparkVersion = Version.parse(inputDataset.sparkSession.version).toFloat + if (sparkVersion >= 3.5f) { + val expressionEncoderClass = + Class.forName("org.apache.spark.sql.catalyst.encoders.ExpressionEncoder") + val applyMethod = expressionEncoderClass.getMethod("apply", classOf[StructType]) + applyMethod.invoke(null, newStructType).asInstanceOf[ExpressionEncoder[Row]] + } else { + try { + // Use reflection to access RowEncoder.apply in older Spark versions + val rowEncoderClass = Class.forName("org.apache.spark.sql.catalyst.encoders.RowEncoder") + val applyMethod = rowEncoderClass.getMethod("apply", classOf[StructType]) + applyMethod.invoke(null, newStructType).asInstanceOf[ExpressionEncoder[Row]] + } catch { + case _: Throwable => + throw new UnsupportedOperationException( + "RowEncoder.apply is not supported in this Spark version.") + } + } + } + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/util/SparkNlpConfigKeys.scala b/src/main/scala/com/johnsnowlabs/nlp/util/SparkNlpConfigKeys.scala deleted file mode 100644 index 83ec541374f025..00000000000000 --- a/src/main/scala/com/johnsnowlabs/nlp/util/SparkNlpConfigKeys.scala 
+++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2017-2022 John Snow Labs - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.johnsnowlabs.nlp.util - -/** Additional configure options that used by spark.nlp */ -object SparkNlpConfigKeys { - - /** Folder to store word embeddings */ - val embeddingsFolder = "sparknlp.embeddings.folder" -} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLSpec.scala index a4d41b68f914da..a899f40db5df32 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/ner/dl/NerDLSpec.scala @@ -316,10 +316,10 @@ class NerDLSpec extends AnyFlatSpec { val training_data = conll.readDataset(ResourceHelper.spark, "src/test/resources/conll2003/eng.train") - val embeddings = WordEmbeddingsModel.pretrained() + val embeddings = WordEmbeddingsModel.pretrained("glove_100d") val nerModel = NerDLModel - .pretrained() + .pretrained("ner_dl", "en") .setInputCols("sentence", "token", "embeddings") .setOutputCol("ner") .setIncludeConfidence(true) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala index 77bd4108f5b8ee..b1f506ede51a38 100644 --- 
a/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/spell/symmetric/SymmetricDeleteBehaviors.scala @@ -166,10 +166,10 @@ trait SymmetricDeleteBehaviors { when(col("word") === col("finished_spell"), 1).otherwise(0)) val rightCorrections = correctedData.filter(col("prediction") === 1).count() val wrongCorrections = correctedData.filter(col("prediction") === 0).count() - printf("Right Corrections: %d \n", rightCorrections) - printf("Wrong Corrections: %d \n", wrongCorrections) + println("Right Corrections: %d \n", rightCorrections) + println("Wrong Corrections: %d \n", wrongCorrections) val accuracy = rightCorrections.toFloat / (rightCorrections + wrongCorrections).toFloat - printf("Accuracy: %f\n", accuracy) + println("Accuracy: %f\n", accuracy) } } } From c04009c7c1bbabc22a2863227d050951f9b11e58 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Wed, 27 Dec 2023 10:47:47 -0500 Subject: [PATCH 08/14] [SPARKNLP-961] Adding ONNX configs to README (#14111) --- README.md | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 9c7eac3bd0e865..b44134970b833e 100644 --- a/README.md +++ b/README.md @@ -915,16 +915,20 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ You can change the following Spark NLP configurations via Spark Configuration: -| Property Name | Default | Meaning | -|--------------------------------------------------------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `spark.jsl.settings.pretrained.cache_folder` | `~/cache_pretrained` | The location to download and extract pretrained `Models` and 
`Pipelines`. By default, it will be in User's Home directory under `cache_pretrained` directory | -| `spark.jsl.settings.storage.cluster_tmp_dir` | `hadoop.tmp.dir` | The location to use on a cluster for temporarily files such as unpacking indexes for WordEmbeddings. By default, this locations is the location of `hadoop.tmp.dir` set via Hadoop configuration for Apache Spark. NOTE: `S3` is not supported and it must be local, HDFS, or DBFS | -| `spark.jsl.settings.annotator.log_folder` | `~/annotator_logs` | The location to save logs from annotators during training such as `NerDLApproach`, `ClassifierDLApproach`, `SentimentDLApproach`, `MultiClassifierDLApproach`, etc. By default, it will be in User's Home directory under `annotator_logs` directory | -| `spark.jsl.settings.aws.credentials.access_key_id` | `None` | Your AWS access key to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | -| `spark.jsl.settings.aws.credentials.secret_access_key` | `None` | Your AWS secret access key to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | -| `spark.jsl.settings.aws.credentials.session_token` | `None` | Your AWS MFA session token to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | -| `spark.jsl.settings.aws.s3_bucket` | `None` | Your AWS S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | -| `spark.jsl.settings.aws.region` | `None` | Your AWS region to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| Property Name | Default | Meaning | 
+|---------------------------------------------------------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `spark.jsl.settings.pretrained.cache_folder` | `~/cache_pretrained` | The location to download and extract pretrained `Models` and `Pipelines`. By default, it will be in User's Home directory under `cache_pretrained` directory | +| `spark.jsl.settings.storage.cluster_tmp_dir` | `hadoop.tmp.dir` | The location to use on a cluster for temporarily files such as unpacking indexes for WordEmbeddings. By default, this locations is the location of `hadoop.tmp.dir` set via Hadoop configuration for Apache Spark. NOTE: `S3` is not supported and it must be local, HDFS, or DBFS | +| `spark.jsl.settings.annotator.log_folder` | `~/annotator_logs` | The location to save logs from annotators during training such as `NerDLApproach`, `ClassifierDLApproach`, `SentimentDLApproach`, `MultiClassifierDLApproach`, etc. 
By default, it will be in User's Home directory under `annotator_logs` directory | +| `spark.jsl.settings.aws.credentials.access_key_id` | `None` | Your AWS access key to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.aws.credentials.secret_access_key` | `None` | Your AWS secret access key to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.aws.credentials.session_token` | `None` | Your AWS MFA session token to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.aws.s3_bucket` | `None` | Your AWS S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.aws.region` | `None` | Your AWS region to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.onnx.gpuDeviceId` | `0` | Constructs CUDA execution provider options for the specified non-negative device id. | +| `spark.jsl.settings.onnx.intraOpNumThreads` | `6` | Sets the size of the CPU thread pool used for executing a single graph, if executing on a CPU. | +| `spark.jsl.settings.onnx.optimizationLevel` | `ALL_OPT` | Sets the optimization level of this options object, overriding the old setting. | +| `spark.jsl.settings.onnx.executionMode` | `SEQUENTIAL` | Sets the execution mode of this options object, overriding the old setting. 
| ### How to set Spark NLP Configuration From 7cf39a041510cfd1c1f3dfb2ab9cb9611ecfdd14 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 27 Dec 2023 18:23:37 +0100 Subject: [PATCH 09/14] Make spark 3.4 default [skip test] --- project/Dependencies.scala | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 95112a2bb338b6..c20fd3f27020de 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -4,13 +4,13 @@ object Dependencies { /** ------- Spark version start ------- */ /* default spark version to base the APIS */ - val spark35Ver = "3.5.0" + val spark34Ver = "3.4.0" /* only used in unit tests */ val spark30Ver = "3.0.3" val spark31Ver = "3.1.3" val spark32Ver = "3.2.3" val spark33Ver = "3.3.1" - val spark34Ver = "3.4.0" + val spark35Ver = "3.5.0" /* required for different hardware */ val is_gpu: String = System.getProperty("is_gpu", "false") @@ -23,10 +23,11 @@ object Dependencies { val is_spark31: String = System.getProperty("is_spark31", "false") val is_spark32: String = System.getProperty("is_spark32", "false") val is_spark33: String = System.getProperty("is_spark33", "false") - val is_spark34: String = System.getProperty("is_spark34", "false") - val is_spark35: String = System.getProperty("is_spark35", "true") + val is_spark34: String = System.getProperty("is_spark34", "true") + val is_spark35: String = System.getProperty("is_spark35", "false") - val sparkVer: String = getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34, is_spark35) + val sparkVer: String = + getSparkVersion(is_spark30, is_spark31, is_spark32, is_spark33, is_spark34, is_spark35) /** ------- Spark version end ------- */ @@ -58,11 +59,11 @@ object Dependencies { spark32Ver } else if (is_spark33.equals("true")) { spark33Ver - } else if (is_spark34.equals("true")) { - spark34Ver + } else if (is_spark35.equals("true")) { + spark35Ver } else { /* default 
spark version */ - spark35Ver + spark34Ver } } @@ -123,5 +124,6 @@ object Dependencies { val azureStorageVersion = "12.22.2" val azureIdentity = "com.azure" % "azure-identity" % azureIdentityVersion % Provided val azureStorage = "com.azure" % "azure-storage-blob" % "12.22.2" % Provided + /** ------- Dependencies end ------- */ } From af68f769917c222ecc85a0fd6b69a80c97361b90 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 27 Dec 2023 18:23:51 +0100 Subject: [PATCH 10/14] remove old GA workflows [skip test] --- .github/workflows/build_and_test.yml | 148 ++++----------------------- 1 file changed, 18 insertions(+), 130 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 23d84e1f82150d..a27b26887bde26 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -31,46 +31,6 @@ on: - 'main' jobs: - spark35: - if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')" - runs-on: macos-latest - env: - TF_CPP_MIN_LOG_LEVEL: 3 - JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" - name: Build and Test on Apache Spark 3.5.x - - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-java@v3 - with: - distribution: 'adopt' - java-version: '8' - cache: 'sbt' - - name: Install Python 3.10 - uses: actions/setup-python@v2 - with: - python-version: 3.10.12 - architecture: x64 - - name: Install Python packages (Python 3.10) - run: | - python -m pip install --upgrade pip - pip install pyspark==3.5.0 numpy pytest - - name: Build Spark NLP on Apache Spark 3.5.0 - run: | - brew install sbt - sbt -mem 4096 -Dis_spark35=true clean assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 3.5.x - run: | - sbt -mem 4096 coverage test - - name: Upload coverage data to Coveralls - run: sbt coverageReport coveralls - env: - COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} - COVERALLS_FLAG_NAME: Apache Spark 3.5.x - Scala 2.12 - - name: Test Spark NLP in Python - Apache Spark 3.5.x - 
run: | - cd python - python3.10 -m pytest -v -m fast spark34: if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')" runs-on: macos-latest @@ -111,14 +71,13 @@ jobs: run: | cd python python3.7 -m pytest -v -m fast - - spark33: + spark35: if: "! contains(toJSON(github.event.commits.*.message), '[skip test]')" runs-on: macos-latest env: TF_CPP_MIN_LOG_LEVEL: 3 JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" - name: Build and Test on Apache Spark 3.3.x + name: Build and Test on Apache Spark 3.5.x steps: - uses: actions/checkout@v3 @@ -127,34 +86,34 @@ jobs: distribution: 'adopt' java-version: '8' cache: 'sbt' - - name: Install Python 3.7 + - name: Install Python 3.10 uses: actions/setup-python@v2 with: - python-version: 3.7.7 + python-version: 3.10.12 architecture: x64 - - name: Install Python packages (Python 3.7) + - name: Install Python packages (Python 3.10) run: | python -m pip install --upgrade pip - pip install pyspark==3.3.1 numpy pytest - - name: Build Spark NLP on Apache Spark 3.3.1 + pip install pyspark==3.5.0 numpy pytest + - name: Build Spark NLP on Apache Spark 3.5.0 run: | brew install sbt - sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 3.3.x + sbt -mem 4096 -Dis_spark35=true clean assemblyAndCopy + - name: Test Spark NLP in Scala - Apache Spark 3.5.x run: | sbt -mem 4096 test - - name: Test Spark NLP in Python - Apache Spark 3.3.x + - name: Test Spark NLP in Python - Apache Spark 3.5.x run: | cd python - python3.7 -m pytest -v -m fast + python3.10 -m pytest -v -m fast - spark32: + spark33: if: "! 
contains(toJSON(github.event.commits.*.message), '[skip test]')" runs-on: macos-latest env: TF_CPP_MIN_LOG_LEVEL: 3 JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" - name: Build and Test on Apache Spark 3.2.x + name: Build and Test on Apache Spark 3.3.x steps: - uses: actions/checkout@v3 @@ -171,87 +130,16 @@ jobs: - name: Install Python packages (Python 3.7) run: | python -m pip install --upgrade pip - pip install pyspark==3.2.3 numpy pytest - - name: Build Spark NLP on Apache Spark 3.2.3 + pip install pyspark==3.3.1 numpy pytest + - name: Build Spark NLP on Apache Spark 3.3.1 run: | brew install sbt - sbt -mem 4096 -Dis_spark32=true clean assemblyAndCopy - - name: Test Spark NLP in Scala - Apache Spark 3.2.x + sbt -mem 4096 -Dis_spark33=true clean assemblyAndCopy + - name: Test Spark NLP in Scala - Apache Spark 3.3.x run: | sbt -mem 4096 test - - name: Test Spark NLP in Python - Apache Spark 3.2.x + - name: Test Spark NLP in Python - Apache Spark 3.3.x run: | cd python python3.7 -m pytest -v -m fast - # spark31: - # if: "! 
contains(toJSON(github.event.commits.*.message), '[skip test]')" - # runs-on: macos-latest - # env: - # TF_CPP_MIN_LOG_LEVEL: 3 - # JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" - # name: Build and Test on Apache Spark 3.1.x - - # steps: - # - uses: actions/checkout@v3 - # - uses: actions/setup-java@v3 - # with: - # distribution: 'adopt' - # java-version: '8' - # cache: 'sbt' - # - name: Install Python 3.7 - # uses: actions/setup-python@v2 - # with: - # python-version: 3.7.7 - # architecture: x64 - # - name: Install Python packages (Python 3.7) - # run: | - # python -m pip install --upgrade pip - # pip install pyspark==3.1.3 numpy pytest - # - name: Build Spark NLP on Apache Spark 3.1.x - # run: | - # brew install sbt - # sbt -mem 4096 -Dis_spark31=true clean assemblyAndCopy - # - name: Test Spark NLP in Scala - Apache Spark 3.1.x - # run: | - # sbt -mem 4096 test - # - name: Test Spark NLP in Python - Apache Spark 3.1.x - # run: | - # cd python - # python3.7 -m pytest -v -m fast - - # spark30: - # if: "! 
contains(toJSON(github.event.commits.*.message), '[skip test]')" - # runs-on: macos-latest - # env: - # TF_CPP_MIN_LOG_LEVEL: 3 - # JAVA_OPTS: "-Xmx4096m -XX:+UseG1GC" - # name: Build and Test on Apache Spark 3.0.x - - # steps: - # - uses: actions/checkout@v3 - # - uses: actions/setup-java@v3 - # with: - # distribution: 'adopt' - # java-version: '8' - # cache: 'sbt' - # - name: Install Python 3.7 - # uses: actions/setup-python@v2 - # with: - # python-version: 3.7.7 - # architecture: x64 - # - name: Install Python packages (Python 3.7) - # run: | - # python -m pip install --upgrade pip - # pip install pyspark==3.0.3 numpy pytest - # - name: Build Spark NLP on Apache Spark 3.0.x - # run: | - # brew install sbt - # sbt -mem 4096 -Dis_spark30=true clean assemblyAndCopy - # - name: Test Spark NLP in Scala - Apache Spark 3.0.x - # run: | - # sbt -mem 4096 test - # - name: Test Spark NLP in Python - Apache Spark 3.0.x - # run: | - # cd python - # python3.7 -m pytest -v -m fast \ No newline at end of file From 64ecc94abfe0d949253efd586b991616d1c4f417 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Wed, 27 Dec 2023 19:28:04 +0100 Subject: [PATCH 11/14] Update README and docs [run doc] --- CHANGELOG | 24 + README.md | 88 +- build.sbt | 2 +- docs/README.md | 1334 ++++++++++++++++- docs/_layouts/landing.html | 2 +- docs/en/concepts.md | 2 +- docs/en/examples.md | 4 +- docs/en/hardware_acceleration.md | 2 +- docs/en/install.md | 54 +- docs/en/spark_nlp.md | 2 +- python/README.md | 88 +- python/docs/conf.py | 2 +- python/setup.py | 2 +- python/sparknlp/__init__.py | 4 +- scripts/colab_setup.sh | 2 +- scripts/kaggle_setup.sh | 2 +- scripts/sagemaker_setup.sh | 2 +- .../scala/com/johnsnowlabs/nlp/SparkNLP.scala | 2 +- .../scala/com/johnsnowlabs/util/Build.scala | 2 +- 19 files changed, 1475 insertions(+), 145 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 874c7e884f9ab8..bc98de8e1b39bc 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,27 @@ +======== +5.2.1 +======== 
+---------------- +New Features & Enhancements +---------------- +* Add support for Spark and PySpark 3.5 major release +* Support Databricks Runtimes of 14.0, 14.1, 14.2, 14.0 ML, 14.1 ML, 14.2 ML, 14.0 GPU, 14.1 GPU, and 14.2 GPU +* **NEW:** Introducing the `BGEEmbeddings` annotator for Spark NLP. This annotator enables the integration of `BGE` models, based on the BERT architecture, into Spark NLP. The `BGEEmbeddings` annotator is designed for generating dense vectors suitable for a variety of applications, including `retrieval`, `classification`, `clustering`, and `semantic search`. Additionally, it is compatible with `vector databases` used in `Large Language Models (LLMs)`. +* **NEW:** Introducing support for ONNX Runtime in DeBertaForTokenClassification annotator +* **NEW:** Introducing support for ONNX Runtime in DeBertaForSequenceClassification annotator +* **NEW:** Introducing support for ONNX Runtime in DeBertaForQuestionAnswering annotator +* Add a new notebook to show how to import any model from `T5` family into Spark NLP with TensorFlow format +* Add a new notebook to show how to import any model from `T5` family into Spark NLP with ONNX format +* Add a new notebook to show how to import any model from `MarianNMT` family into Spark NLP with ONNX format + + +---------------- +Bug Fixes +---------------- +* Fix serialization issue in `DocumentTokenSplitter` annotator failing to be saved and loaded in a Pipeline +* Fix serialization issue in `DocumentCharacterTextSplitter` annotator failing to be saved and loaded in a Pipeline + + ======== 5.2.0 ======== diff --git a/README.md b/README.md index b44134970b833e..eec41863cc3125 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.2.0 is built with ONNX 1.16.3 and TensorFlow 2.7.1 deep learning engines. 
The minimum following NVIDIA® software are only required for GPU support: +Spark NLP 5.2.1 is built with ONNX 1.16.3 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -189,7 +189,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.2.0 pyspark==3.3.1 +$ pip install spark-nlp==5.2.1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -234,7 +234,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.2.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x +Spark NLP *5.2.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -276,7 +276,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.2.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.2.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -343,7 +343,7 @@ Spark NLP 5.2.0 has been tested and is compatible with the following runtimes: ## EMR Support -Spark NLP 5.2.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.2.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -390,11 +390,11 @@ 
Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` The `spark-nlp` has been published to @@ -403,11 +403,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.1 ``` @@ -417,11 +417,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.1 ``` @@ -431,11 +431,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.0 +pyspark --packages 
com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.1 ``` @@ -449,7 +449,7 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` ## Scala @@ -467,7 +467,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.2.0 + 5.2.1 ``` @@ -478,7 +478,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.2.0 + 5.2.1 ``` @@ -489,7 +489,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.2.0 + 5.2.1 ``` @@ -500,7 +500,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.2.0 + 5.2.1 ``` @@ -510,28 +510,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.2.1" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.2.1" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.2.1" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.1" ``` Maven @@ -553,7 +553,7 @@ If you installed pyspark through pip/conda, you can install 
`spark-nlp` through Pip: ```bash -pip install spark-nlp==5.2.0 +pip install spark-nlp==5.2.1 ``` Conda: @@ -582,7 +582,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1") .getOrCreate() ``` @@ -653,7 +653,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -664,7 +664,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.2.0 +pip install spark-nlp==5.2.1 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -692,7 +692,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.2.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.2.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -709,7 +709,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -736,7 +736,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget 
https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -759,7 +759,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.1 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -778,9 +778,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.2.0` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.2.1` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
@@ -831,7 +831,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1" } }] ``` @@ -840,7 +840,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.2.0" \ +--name "Spark NLP 5.2.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -904,7 +904,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -947,7 +947,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1") .getOrCreate() ``` @@ -961,7 +961,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` **pyspark:** @@ -974,7 +974,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` **Databricks:** @@ -1246,7 +1246,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.2.0.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.2.1.jar") .getOrCreate() ``` @@ -1255,7 +1255,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.2.0.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.2.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/build.sbt b/build.sbt index e044783b44ac20..21016721060f74 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "5.2.0" +version := "5.2.1" (ThisBuild / scalaVersion) := scalaVer diff --git a/docs/README.md b/docs/README.md index f18e86a451ac37..eec41863cc3125 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,26 +1,1332 @@ -# Spark NLP Documentation +# Spark NLP: State-of-the-Art Natural Language Processing & LLMs Library -We welcome you to contribute to Spark NLP documentation hosted inside `en/` directory. All the files are in Markdown format. +

+ + + + + + + + + + + + + + +

-## Development +Spark NLP is a state-of-the-art Natural Language Processing library built on top of Apache Spark. It provides **simple**, **performant** & **accurate** NLP annotations for machine learning pipelines that **scale** easily in a distributed +environment. +Spark NLP comes with **30000+** pretrained **pipelines** and **models** in more than **200+** languages. +It also offers tasks such as **Tokenization**, **Word Segmentation**, **Part-of-Speech Tagging**, Word and Sentence **Embeddings**, **Named Entity Recognition**, **Dependency Parsing**, **Spell Checking**, **Text Classification**, **Sentiment Analysis**, **Token Classification**, **Machine Translation** (+180 languages), **Summarization**, **Question Answering**, **Table Question Answering**, **Text Generation**, **Image Classification**, **Image to Text (captioning)**, **Automatic Speech Recognition**, **Zero-Shot Learning**, and many more [NLP tasks](#features). -For development purposes, you need to have `bundle` and `Gem` installed on your system. Please run these commands: +**Spark NLP** is the only open-source NLP library in **production** that offers state-of-the-art transformers such as **BERT**, **CamemBERT**, **ALBERT**, **ELECTRA**, **XLNet**, **DistilBERT**, **RoBERTa**, **DeBERTa**, **XLM-RoBERTa**, **Longformer**, **ELMO**, **Universal Sentence Encoder**, **Facebook BART**, **Instructor**, **E5**, **Google T5**, **MarianMT**, **OpenAI GPT2**, and **Vision Transformers (ViT)** not only to **Python** and **R**, but also to **JVM** ecosystem (**Java**, **Scala**, and **Kotlin**) at **scale** by extending **Apache Spark** natively. 
+ +## Project's website + +Take a look at our official Spark NLP page: [https://sparknlp.org/](https://sparknlp.org/) for user +documentation and examples + +## Community support + +- [Slack](https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q) For live discussion with the Spark NLP community and the team +- [GitHub](https://github.com/JohnSnowLabs/spark-nlp) Bug reports, feature requests, and contributions +- [Discussions](https://github.com/JohnSnowLabs/spark-nlp/discussions) Engage with other community members, share ideas, + and show off how you use Spark NLP! +- [Medium](https://medium.com/spark-nlp) Spark NLP articles +- [YouTube](https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos) Spark NLP video tutorials + +## Table of contents + +- [Features](#features) +- [Requirements](#requirements) +- [Quick Start](#quick-start) +- [Apache Spark Support](#apache-spark-support) +- [Scala & Python Support](#scala-and-python-support) +- [Databricks Support](#databricks-support) +- [EMR Support](#emr-support) +- [Using Spark NLP](#usage) + - [Packages Cheatsheet](#packages-cheatsheet) + - [Spark Packages](#spark-packages) + - [Scala](#scala) + - [Maven](#maven) + - [SBT](#sbt) + - [Python](#python) + - [Pip/Conda](#pipconda) + - [Compiled JARs](#compiled-jars) + - [Apache Zeppelin](#apache-zeppelin) + - [Jupyter Notebook](#jupyter-notebook-python) + - [Google Colab Notebook](#google-colab-notebook) + - [Kaggle Kernel](#kaggle-kernel) + - [Databricks Cluster](#databricks-cluster) + - [EMR Cluster](#emr-cluster) + - [GCP Dataproc](#gcp-dataproc) + - [Spark NLP Configuration](#spark-nlp-configuration) +- [Pipelines & Models](#pipelines-and-models) + - [Pipelines](#pipelines) + - [Models](#models) +- [Offline](#offline) +- [Examples](#examples) +- [FAQ](#faq) +- [Citation](#citation) +- [Contributing](#contributing) + +## Features + +- Tokenization +- Trainable Word Segmentation +- Stop Words Removal +- Token 
Normalizer +- Document Normalizer +- Document & Text Splitter +- Stemmer +- Lemmatizer +- NGrams +- Regex Matching +- Text Matching +- Chunking +- Date Matcher +- Sentence Detector +- Deep Sentence Detector (Deep learning) +- Dependency parsing (Labeled/unlabeled) +- SpanBertCorefModel (Coreference Resolution) +- Part-of-speech tagging +- Sentiment Detection (ML models) +- Spell Checker (ML and DL models) +- Word Embeddings (GloVe and Word2Vec) +- Doc2Vec (based on Word2Vec) +- BERT Embeddings (TF Hub & HuggingFace models) +- DistilBERT Embeddings (HuggingFace models) +- CamemBERT Embeddings (HuggingFace models) +- RoBERTa Embeddings (HuggingFace models) +- DeBERTa Embeddings (HuggingFace v2 & v3 models) +- XLM-RoBERTa Embeddings (HuggingFace models) +- Longformer Embeddings (HuggingFace models) +- ALBERT Embeddings (TF Hub & HuggingFace models) +- XLNet Embeddings +- ELMO Embeddings (TF Hub models) +- Universal Sentence Encoder (TF Hub models) +- BERT Sentence Embeddings (TF Hub & HuggingFace models) +- RoBerta Sentence Embeddings (HuggingFace models) +- XLM-RoBerta Sentence Embeddings (HuggingFace models) +- Instructor Embeddings (HuggingFace models) +- E5 Embeddings (HuggingFace models) +- MPNet Embeddings (HuggingFace models) +- OpenAI Embeddings +- Sentence Embeddings +- Chunk Embeddings +- Unsupervised keywords extraction +- Language Detection & Identification (up to 375 languages) +- Multi-class Sentiment analysis (Deep learning) +- Multi-label Sentiment analysis (Deep learning) +- Multi-class Text Classification (Deep learning) +- BERT for Token & Sequence Classification +- DistilBERT for Token & Sequence Classification +- CamemBERT for Token & Sequence Classification +- ALBERT for Token & Sequence Classification +- RoBERTa for Token & Sequence Classification +- DeBERTa for Token & Sequence Classification +- XLM-RoBERTa for Token & Sequence Classification +- XLNet for Token & Sequence Classification +- Longformer for Token & Sequence Classification +- BERT 
for Token & Sequence Classification +- BERT for Question Answering +- CamemBERT for Question Answering +- DistilBERT for Question Answering +- ALBERT for Question Answering +- RoBERTa for Question Answering +- DeBERTa for Question Answering +- XLM-RoBERTa for Question Answering +- Longformer for Question Answering +- Table Question Answering (TAPAS) +- Zero-Shot NER Model +- Zero-Shot Text Classification by Transformers (ZSL) +- Neural Machine Translation (MarianMT) +- Text-To-Text Transfer Transformer (Google T5) +- Generative Pre-trained Transformer 2 (OpenAI GPT2) +- Seq2Seq for NLG, Translation, and Comprehension (Facebook BART) +- Vision Transformer (Google ViT) +- Swin Image Classification (Microsoft Swin Transformer) +- ConvNext Image Classification (Facebook ConvNext) +- Vision Encoder Decoder for image-to-text like captioning +- Zero-Shot Image Classification by OpenAI's CLIP +- Automatic Speech Recognition (Wav2Vec2) +- Automatic Speech Recognition (HuBERT) +- Automatic Speech Recognition (OpenAI Whisper) +- Named entity recognition (Deep learning) +- Easy ONNX and TensorFlow integrations +- GPU Support +- Full integration with Spark ML functions +- +24000 pre-trained models in +200 languages! +- +6000 pre-trained pipelines in +200 languages! +- Multi-lingual NER models: Arabic, Bengali, Chinese, Danish, Dutch, English, Finnish, French, German, Hebrew, Italian, + Japanese, Korean, Norwegian, Persian, Polish, Portuguese, Russian, Spanish, Swedish, Urdu, and more. + +## Requirements + +To use Spark NLP you need the following requirements: + +- Java 8 and 11 +- Apache Spark 3.5.x, 3.4.x, 3.3.x, 3.2.x, 3.1.x, 3.0.x + +**GPU (optional):** + +Spark NLP 5.2.1 is built with ONNX 1.16.3 and TensorFlow 2.7.1 deep learning engines. 
The minimum following NVIDIA® software are only required for GPU support: + +- NVIDIA® GPU drivers version 450.80.02 or higher +- CUDA® Toolkit 11.2 +- cuDNN SDK 8.1.0 + +## Quick Start + +This is a quick example of how to use Spark NLP pre-trained pipeline in Python and PySpark: + +```sh +$ java -version +# should be Java 8 or 11 (Oracle or OpenJDK) +$ conda create -n sparknlp python=3.7 -y +$ conda activate sparknlp +# spark-nlp by default is based on pyspark 3.x +$ pip install spark-nlp==5.2.1 pyspark==3.3.1 +``` + +In Python console or Jupyter `Python3` kernel: + +```python +# Import Spark NLP +from sparknlp.base import * +from sparknlp.annotator import * +from sparknlp.pretrained import PretrainedPipeline +import sparknlp + +# Start SparkSession with Spark NLP +# start() functions has 3 parameters: gpu, apple_silicon, and memory +# sparknlp.start(gpu=True) will start the session with GPU support +# sparknlp.start(apple_silicon=True) will start the session with macOS M1 & M2 support +# sparknlp.start(memory="16G") to change the default driver memory in SparkSession +spark = sparknlp.start() + +# Download a pre-trained pipeline +pipeline = PretrainedPipeline('explain_document_dl', lang='en') + +# Your testing dataset +text = """ +The Mona Lisa is a 16th century oil painting created by Leonardo. +It's held at the Louvre in Paris. +""" + +# Annotate your testing dataset +result = pipeline.annotate(text) + +# What's in the pipeline +list(result.keys()) +Output: ['entities', 'stem', 'checked', 'lemma', 'document', + 'pos', 'token', 'ner', 'embeddings', 'sentence'] + +# Check the results +result['entities'] +Output: ['Mona Lisa', 'Leonardo', 'Louvre', 'Paris'] +``` + +For more examples, you can visit our dedicated [examples](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples) to showcase all Spark NLP use cases! 
+ +## Apache Spark Support + +Spark NLP *5.2.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x + +| Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x | +|-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| +| 5.2.x | Partially | YES | YES | YES | YES | YES | NO | NO | +| 5.1.x | Partially | YES | YES | YES | YES | YES | NO | NO | +| 5.0.x | YES | YES | YES | YES | YES | YES | NO | NO | +| 4.4.x | YES | YES | YES | YES | YES | YES | NO | NO | +| 4.3.x | NO | NO | YES | YES | YES | YES | NO | NO | +| 4.2.x | NO | NO | YES | YES | YES | YES | NO | NO | +| 4.1.x | NO | NO | YES | YES | YES | YES | NO | NO | +| 4.0.x | NO | NO | YES | YES | YES | YES | NO | NO | +| 3.4.x | NO | NO | N/A | Partially | YES | YES | YES | YES | +| 3.3.x | NO | NO | NO | NO | YES | YES | YES | YES | +| 3.2.x | NO | NO | NO | NO | YES | YES | YES | YES | +| 3.1.x | NO | NO | NO | NO | YES | YES | YES | YES | +| 3.0.x | NO | NO | NO | NO | YES | YES | YES | YES | +| 2.7.x | NO | NO | NO | NO | NO | NO | YES | YES | + +Find out more about `Spark NLP` versions from our [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases). 
+ +## Scala and Python Support + +| Spark NLP | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10| Scala 2.11 | Scala 2.12 | +|-----------|------------|------------|------------|------------|------------|------------|------------| +| 5.2.x | NO | YES | YES | YES | YES | NO | YES | +| 5.1.x | NO | YES | YES | YES | YES | NO | YES | +| 5.0.x | NO | YES | YES | YES | YES | NO | YES | +| 4.4.x | NO | YES | YES | YES | YES | NO | YES | +| 4.3.x | YES | YES | YES | YES | YES | NO | YES | +| 4.2.x | YES | YES | YES | YES | YES | NO | YES | +| 4.1.x | YES | YES | YES | YES | NO | NO | YES | +| 4.0.x | YES | YES | YES | YES | NO | NO | YES | +| 3.4.x | YES | YES | YES | YES | NO | YES | YES | +| 3.3.x | YES | YES | YES | NO | NO | YES | YES | +| 3.2.x | YES | YES | YES | NO | NO | YES | YES | +| 3.1.x | YES | YES | YES | NO | NO | YES | YES | +| 3.0.x | YES | YES | YES | NO | NO | YES | YES | +| 2.7.x | YES | YES | NO | NO | NO | YES | NO | + +## Databricks Support + +Spark NLP 5.2.1 has been tested and is compatible with the following runtimes: + +**CPU:** + +- 9.1 +- 9.1 ML +- 10.1 +- 10.1 ML +- 10.2 +- 10.2 ML +- 10.3 +- 10.3 ML +- 10.4 +- 10.4 ML +- 10.5 +- 10.5 ML +- 11.0 +- 11.0 ML +- 11.1 +- 11.1 ML +- 11.2 +- 11.2 ML +- 11.3 +- 11.3 ML +- 12.0 +- 12.0 ML +- 12.1 +- 12.1 ML +- 12.2 +- 12.2 ML +- 13.0 +- 13.0 ML +- 13.1 +- 13.1 ML +- 13.2 +- 13.2 ML +- 13.3 +- 13.3 ML +- 14.0 +- 14.0 ML +- 14.1 +- 14.1 ML + +**GPU:** + +- 9.1 ML & GPU +- 10.1 ML & GPU +- 10.2 ML & GPU +- 10.3 ML & GPU +- 10.4 ML & GPU +- 10.5 ML & GPU +- 11.0 ML & GPU +- 11.1 ML & GPU +- 11.2 ML & GPU +- 11.3 ML & GPU +- 12.0 ML & GPU +- 12.1 ML & GPU +- 12.2 ML & GPU +- 13.0 ML & GPU +- 13.1 ML & GPU +- 13.2 ML & GPU +- 13.3 ML & GPU +- 14.0 ML & GPU +- 14.1 ML & GPU + +## EMR Support + +Spark NLP 5.2.1 has been tested and is compatible with the following EMR releases: + +- emr-6.2.0 +- emr-6.3.0 +- emr-6.3.1 +- emr-6.4.0 +- emr-6.5.0 +- emr-6.6.0 +- emr-6.7.0 +- emr-6.8.0 +- emr-6.9.0 
+- emr-6.10.0 +- emr-6.11.0 +- emr-6.12.0 +- emr-6.13.0 +- emr-6.14.0 + +Full list of [Amazon EMR 6.x releases](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-release-6x.html) + +NOTE: The EMR 6.1.0 and 6.1.1 are not supported. + +## Usage + +## Packages Cheatsheet + +This is a cheatsheet for corresponding Spark NLP Maven package to Apache Spark / PySpark major version: + +| Apache Spark | Spark NLP on CPU | Spark NLP on GPU | Spark NLP on AArch64 (linux) | Spark NLP on Apple Silicon | +|-------------------------|--------------------|----------------------------|--------------------------------|--------------------------------------| +| 3.0/3.1/3.2/3.3/3.4/3.5 | `spark-nlp` | `spark-nlp-gpu` | `spark-nlp-aarch64` | `spark-nlp-silicon` | +| Start Function | `sparknlp.start()` | `sparknlp.start(gpu=True)` | `sparknlp.start(aarch64=True)` | `sparknlp.start(apple_silicon=True)` | + +NOTE: `M1/M2` and `AArch64` are under `experimental` support. Access and support to these architectures are limited by the +community and we had to build most of the dependencies by ourselves to make them compatible. We support these two +architectures, however, they may not work in some environments. + +## Spark Packages + +### Command line (requires internet connection) + +Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, Apache Spark 3.2.x, Apache Spark 3.3.x, Apache Spark 3.4.x, and Apache Spark 3.5.x + +#### Apache Spark 3.x (3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x - Scala 2.12) + +```sh +# CPU + +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 + +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 + +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 +``` + +The `spark-nlp` has been published to +the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp). 
+ +```sh +# GPU + +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.1 + +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.1 + +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.1 + +``` + +The `spark-nlp-gpu` has been published to +the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu). + +```sh +# AArch64 + +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.1 + +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.1 + +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.1 + +``` + +The `spark-nlp-aarch64` has been published to +the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64). + +```sh +# M1/M2 (Apple Silicon) + +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.1 + +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.1 + +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.1 + +``` + +The `spark-nlp-silicon` has been published to +the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon). + +**NOTE**: In case you are using large pretrained models like UniversalSentenceEncoder, you need to have the following +set in your SparkSession: + +```sh +spark-shell \ + --driver-memory 16g \ + --conf spark.kryoserializer.buffer.max=2000M \ + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 +``` + +## Scala + +Spark NLP supports Scala 2.12.15 if you are using Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x versions. Our packages are +deployed to Maven central. 
To add any of our packages as a dependency in your application you can follow these +coordinates: + +### Maven + +**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x: + +```xml + + + com.johnsnowlabs.nlp + spark-nlp_2.12 + 5.2.1 + +``` + +**spark-nlp-gpu:** + +```xml + + + com.johnsnowlabs.nlp + spark-nlp-gpu_2.12 + 5.2.1 + +``` + +**spark-nlp-aarch64:** + +```xml + + + com.johnsnowlabs.nlp + spark-nlp-aarch64_2.12 + 5.2.1 + +``` + +**spark-nlp-silicon:** + +```xml + + + com.johnsnowlabs.nlp + spark-nlp-silicon_2.12 + 5.2.1 + +``` + +### SBT + +**spark-nlp** on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x: + +```sbtshell +// https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.2.1" +``` + +**spark-nlp-gpu:** + +```sbtshell +// https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.2.1" +``` + +**spark-nlp-aarch64:** + +```sbtshell +// https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.2.1" +``` + +**spark-nlp-silicon:** + +```sbtshell +// https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.1" +``` + +Maven +Central: [https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) + +If you are interested, there is a simple SBT project for Spark NLP to guide you on how to use it in your +projects [Spark NLP SBT Starter](https://github.com/maziyarpanahi/spark-nlp-starter) + +## Python + +Spark NLP supports Python 3.6.x and above depending on your major PySpark version. 
+ +### Python without explicit Pyspark installation + +### Pip/Conda + +If you installed pyspark through pip/conda, you can install `spark-nlp` through the same channel. + +Pip: + +```bash +pip install spark-nlp==5.2.1 +``` + +Conda: + +```bash +conda install -c johnsnowlabs spark-nlp +``` + +PyPI [spark-nlp package](https://pypi.org/project/spark-nlp/) / +Anaconda [spark-nlp package](https://anaconda.org/JohnSnowLabs/spark-nlp) + +Then you'll have to create a SparkSession either from Spark NLP: + +```python +import sparknlp + +spark = sparknlp.start() +``` + +or manually: + +```python +spark = SparkSession.builder + .appName("Spark NLP") + .master("local[*]") + .config("spark.driver.memory", "16G") + .config("spark.driver.maxResultSize", "0") + .config("spark.kryoserializer.buffer.max", "2000M") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1") + .getOrCreate() +``` + +If using local jars, you can use `spark.jars` instead for comma-delimited jar files. For cluster setups, of course, +you'll have to put the jars in a reachable location for all driver and executor nodes. + +**Quick example:** + +```python +import sparknlp +from sparknlp.pretrained import PretrainedPipeline + +# create or get Spark Session + +spark = sparknlp.start() + +sparknlp.version() +spark.version + +# download, load and annotate a text by pre-trained pipeline + +pipeline = PretrainedPipeline('recognize_entities_dl', 'en') +result = pipeline.annotate('The Mona Lisa is a 16th century oil painting created by Leonardo') +``` + +## Compiled JARs + +### Build from source + +#### spark-nlp + +- FAT-JAR for CPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x + +```bash +sbt assembly +``` + +- FAT-JAR for GPU on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x + +```bash +sbt -Dis_gpu=true assembly +``` + +- FAT-JAR for M! 
on Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x + +```bash +sbt -Dis_silicon=true assembly +``` + +### Using the jar manually + +If for some reason you need to use the JAR, you can either download the Fat JARs provided here or download it +from [Maven Central](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp). + +To add JARs to spark programs use the `--jars` option: + +```sh +spark-shell --jars spark-nlp.jar +``` + +The preferred way to use the library when running spark programs is using the `--packages` option as specified in +the `spark-packages` section. + +## Apache Zeppelin + +Use either one of the following options + +- Add the following Maven Coordinates to the interpreter's library list + +```bash +com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 +``` + +- Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is + available to driver path + +### Python in Zeppelin + +Apart from the previous step, install the python module through pip ```bash -bundle update -bundle install -bundle exec jekyll serve +pip install spark-nlp==5.2.1 +``` + +Or you can install `spark-nlp` from inside Zeppelin by using Conda: + +```bash +python.conda install -c johnsnowlabs spark-nlp +``` + +Configure Zeppelin properly, use cells with %spark.pyspark or any interpreter name you chose. + +Finally, in Zeppelin interpreter settings, make sure you set properly zeppelin.python to the python you want to use and +install the pip library with (e.g. `python3`). + +An alternative option would be to set `SPARK_SUBMIT_OPTIONS` (zeppelin-env.sh) and make sure `--packages` is there as +shown earlier since it includes both scala and python side installation. 
+ +## Jupyter Notebook (Python) + +**Recommended:** + +The easiest way to get this done on Linux and macOS is to simply install `spark-nlp` and `pyspark` PyPI packages and +launch the Jupyter from the same Python environment: -# Server address: http://127.0.0.1:4000 +```sh +$ conda create -n sparknlp python=3.8 -y +$ conda activate sparknlp +# spark-nlp by default is based on pyspark 3.x +$ pip install spark-nlp==5.2.1 pyspark==3.3.1 jupyter +$ jupyter notebook ``` -## How to build the PyDocs +Then you can use `python3` kernel to run your code with creating SparkSession via `spark = sparknlp.start()`. + +**Optional:** + +If you are in different operating systems and require to make Jupyter Notebook run by using pyspark, you can follow +these steps: + +```bash +export SPARK_HOME=/path/to/your/spark/folder +export PYSPARK_PYTHON=python3 +export PYSPARK_DRIVER_PYTHON=jupyter +export PYSPARK_DRIVER_PYTHON_OPTS=notebook + +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 +``` + +Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` + +If not using pyspark at all, you'll have to run the instructions +pointed [here](#python-without-explicit-pyspark-installation) + +## Google Colab Notebook + +Google Colab is perhaps the easiest way to get started with spark-nlp. It requires no installation or setup other than +having a Google account. + +Run the following code in Google Colab notebook and start using spark-nlp right away. 
+ +```sh +# This is only to setup PySpark and Spark NLP on Colab +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash +``` + +This script comes with the two options to define `pyspark` and `spark-nlp` versions via options: + +```sh +# -p is for pyspark +# -s is for spark-nlp +# -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage +# by default they are set to the latest +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.1 +``` + +[Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) +is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP +pretrained pipelines. + +## Kaggle Kernel + +Run the following code in Kaggle Kernel and start using spark-nlp right away. + +```sh +# Let's setup Kaggle for Spark NLP and PySpark +!wget https://setup.johnsnowlabs.com/kaggle.sh -O - | bash +``` + +This script comes with the two options to define `pyspark` and `spark-nlp` versions via options: + +```sh +# -p is for pyspark +# -s is for spark-nlp +# -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage +# by default they are set to the latest +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.1 +``` + +[Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live +demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP pretrained pipeline. + +## Databricks Cluster + +1. Create a cluster if you don't have one already + +2. On a new cluster or existing one you need to add the following to the `Advanced Options -> Spark` tab: + + ```bash + spark.kryoserializer.buffer.max 2000M + spark.serializer org.apache.spark.serializer.KryoSerializer + ``` + +3. In `Libraries` tab inside your cluster you need to follow these steps: + + 3.1. 
Install New -> PyPI -> `spark-nlp==5.2.1` -> Install + + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1` -> Install + +4. Now you can attach your notebook to the cluster and use Spark NLP! + +NOTE: Databricks' runtimes support different Apache Spark major releases. Please make sure you choose the correct Spark +NLP Maven package name (Maven Coordinate) for your runtime from +our [Packages Cheatsheet](https://github.com/JohnSnowLabs/spark-nlp#packages-cheatsheet) + +## EMR Cluster + +To launch EMR clusters with Apache Spark/PySpark and Spark NLP correctly you need to have bootstrap and software +configuration. + +A sample of your bootstrap script + +```.sh +#!/bin/bash +set -x -e + +echo -e 'export PYSPARK_PYTHON=/usr/bin/python3 +export HADOOP_CONF_DIR=/etc/hadoop/conf +export SPARK_JARS_DIR=/usr/lib/spark/jars +export SPARK_HOME=/usr/lib/spark' >> $HOME/.bashrc && source $HOME/.bashrc + +sudo python3 -m pip install awscli boto spark-nlp + +set +x +exit 0 + +``` + +A sample of your software configuration in JSON on S3 (must be public access): + +```.json +[{ + "Classification": "spark-env", + "Configurations": [{ + "Classification": "export", + "Properties": { + "PYSPARK_PYTHON": "/usr/bin/python3" + } + }] +}, +{ + "Classification": "spark-defaults", + "Properties": { + "spark.yarn.stagingDir": "hdfs:///tmp", + "spark.yarn.preserve.staging.files": "true", + "spark.kryoserializer.buffer.max": "2000M", + "spark.serializer": "org.apache.spark.serializer.KryoSerializer", + "spark.driver.maxResultSize": "0", + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1" + } +}] +``` + +A sample of AWS CLI to launch EMR cluster: + +```.sh +aws emr create-cluster \ +--name "Spark NLP 5.2.1" \ +--release-label emr-6.2.0 \ +--applications Name=Hadoop Name=Spark Name=Hive \ +--instance-type m4.4xlarge \ +--instance-count 3 \ +--use-default-roles \ +--log-uri "s3:///" \ +--bootstrap-actions 
Path=s3:///emr-bootstrap.sh,Name=custome \ +--configurations "https:///sparknlp-config.json" \ +--ec2-attributes KeyName=,EmrManagedMasterSecurityGroup=,EmrManagedSlaveSecurityGroup= \ +--profile +``` + +## GCP Dataproc + +1. Create a cluster if you don't have one already as follows. + +At gcloud shell: + +```bash +gcloud services enable dataproc.googleapis.com \ + compute.googleapis.com \ + storage-component.googleapis.com \ + bigquery.googleapis.com \ + bigquerystorage.googleapis.com +``` + +```bash +REGION= +``` + +```bash +BUCKET_NAME= +gsutil mb -c standard -l ${REGION} gs://${BUCKET_NAME} +``` + +```bash +REGION= +ZONE= +CLUSTER_NAME= +BUCKET_NAME= +``` + +You can set image-version, master-machine-type, worker-machine-type, +master-boot-disk-size, worker-boot-disk-size, num-workers as your needs. +If you use the previous image-version from 2.0, you should also add ANACONDA to optional-components. +And, you should enable gateway. +Don't forget to set the maven coordinates for the jar in properties. + +```bash +gcloud dataproc clusters create ${CLUSTER_NAME} \ + --region=${REGION} \ + --zone=${ZONE} \ + --image-version=2.0 \ + --master-machine-type=n1-standard-4 \ + --worker-machine-type=n1-standard-2 \ + --master-boot-disk-size=128GB \ + --worker-boot-disk-size=128GB \ + --num-workers=2 \ + --bucket=${BUCKET_NAME} \ + --optional-components=JUPYTER \ + --enable-component-gateway \ + --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ + --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 +``` + +2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. + +3. 
Now, you can attach your notebook to the cluster and use Spark NLP!
+
+## Spark NLP Configuration
+
+You can change the following Spark NLP configurations via Spark Configuration:
+
+| Property Name | Default | Meaning |
+|---------------------------------------------------------|----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `spark.jsl.settings.pretrained.cache_folder` | `~/cache_pretrained` | The location to download and extract pretrained `Models` and `Pipelines`. By default, it will be in User's Home directory under `cache_pretrained` directory |
+| `spark.jsl.settings.storage.cluster_tmp_dir` | `hadoop.tmp.dir` | The location to use on a cluster for temporary files such as unpacking indexes for WordEmbeddings. By default, this location is the location of `hadoop.tmp.dir` set via Hadoop configuration for Apache Spark. NOTE: `S3` is not supported and it must be local, HDFS, or DBFS | +| `spark.jsl.settings.annotator.log_folder` | `~/annotator_logs` | The location to save logs from annotators during training such as `NerDLApproach`, `ClassifierDLApproach`, `SentimentDLApproach`, `MultiClassifierDLApproach`, etc. 
By default, it will be in User's Home directory under `annotator_logs` directory | +| `spark.jsl.settings.aws.credentials.access_key_id` | `None` | Your AWS access key to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.aws.credentials.secret_access_key` | `None` | Your AWS secret access key to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.aws.credentials.session_token` | `None` | Your AWS MFA session token to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.aws.s3_bucket` | `None` | Your AWS S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.aws.region` | `None` | Your AWS region to use your S3 bucket to store log files of training models or access tensorflow graphs used in `NerDLApproach` | +| `spark.jsl.settings.onnx.gpuDeviceId` | `0` | Constructs CUDA execution provider options for the specified non-negative device id. | +| `spark.jsl.settings.onnx.intraOpNumThreads` | `6` | Sets the size of the CPU thread pool used for executing a single graph, if executing on a CPU. | +| `spark.jsl.settings.onnx.optimizationLevel` | `ALL_OPT` | Sets the optimization level of this options object, overriding the old setting. | +| `spark.jsl.settings.onnx.executionMode` | `SEQUENTIAL` | Sets the execution mode of this options object, overriding the old setting. | + +### How to set Spark NLP Configuration + +**SparkSession:** + +You can use `.config()` during SparkSession creation to set Spark NLP configurations. 
+ +```python +from pyspark.sql import SparkSession + +spark = SparkSession.builder + .master("local[*]") + .config("spark.driver.memory", "16G") + .config("spark.driver.maxResultSize", "0") + .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config("spark.kryoserializer.buffer.max", "2000m") + .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") + .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1") + .getOrCreate() +``` + +**spark-shell:** + +```sh +spark-shell \ + --driver-memory 16g \ + --conf spark.driver.maxResultSize=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer + --conf spark.kryoserializer.buffer.max=2000M \ + --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ + --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 +``` + +**pyspark:** + +```sh +pyspark \ + --driver-memory 16g \ + --conf spark.driver.maxResultSize=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer + --conf spark.kryoserializer.buffer.max=2000M \ + --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ + --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 +``` + +**Databricks:** + +On a new cluster or existing one you need to add the following to the `Advanced Options -> Spark` tab: + +```bash +spark.kryoserializer.buffer.max 2000M +spark.serializer org.apache.spark.serializer.KryoSerializer +spark.jsl.settings.pretrained.cache_folder dbfs:/PATH_TO_CACHE +spark.jsl.settings.storage.cluster_tmp_dir dbfs:/PATH_TO_STORAGE +spark.jsl.settings.annotator.log_folder dbfs:/PATH_TO_LOGS +``` + +NOTE: If this is an existing cluster, after adding new configs or changing existing properties you need to 
restart it. + +### S3 Integration + +In Spark NLP we can define S3 locations to: + +- Export log files of training models +- Store tensorflow graphs used in `NerDLApproach` + +**Logging:** + +To configure S3 path for logging while training models. We need to set up AWS credentials as well as an S3 path + +```bash +spark.conf.set("spark.jsl.settings.annotator.log_folder", "s3://my/s3/path/logs") +spark.conf.set("spark.jsl.settings.aws.credentials.access_key_id", "MY_KEY_ID") +spark.conf.set("spark.jsl.settings.aws.credentials.secret_access_key", "MY_SECRET_ACCESS_KEY") +spark.conf.set("spark.jsl.settings.aws.s3_bucket", "my.bucket") +spark.conf.set("spark.jsl.settings.aws.region", "my-region") +``` + +Now you can check the log on your S3 path defined in *spark.jsl.settings.annotator.log_folder* property. +Make sure to use the prefix *s3://*, otherwise it will use the default configuration. + +**Tensorflow Graphs:** + +To reference S3 location for downloading graphs. We need to set up AWS credentials + +```bash +spark.conf.set("spark.jsl.settings.aws.credentials.access_key_id", "MY_KEY_ID") +spark.conf.set("spark.jsl.settings.aws.credentials.secret_access_key", "MY_SECRET_ACCESS_KEY") +spark.conf.set("spark.jsl.settings.aws.region", "my-region") +``` + +**MFA Configuration:** + +In case your AWS account is configured with MFA. 
You will need first to get temporal credentials and add session token +to the configuration as shown in the examples below +For logging: + +```bash +spark.conf.set("spark.jsl.settings.aws.credentials.session_token", "MY_TOKEN") +``` + +An example of a bash script that gets temporal AWS credentials can be +found [here](https://github.com/JohnSnowLabs/spark-nlp/blob/master/scripts/aws_tmp_credentials.sh) +This script requires three arguments: + +```bash +./aws_tmp_credentials.sh iam_user duration serial_number +``` + +## Pipelines and Models + +### Pipelines + +**Quick example:** + +```scala +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline +import com.johnsnowlabs.nlp.SparkNLP + +SparkNLP.version() + +val testData = spark.createDataFrame(Seq( + (1, "Google has announced the release of a beta version of the popular TensorFlow machine learning library"), + (2, "Donald John Trump (born June 14, 1946) is the 45th and current president of the United States") +)).toDF("id", "text") + +val pipeline = PretrainedPipeline("explain_document_dl", lang = "en") + +val annotation = pipeline.transform(testData) + +annotation.show() +/* +import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline +import com.johnsnowlabs.nlp.SparkNLP +2.5.0 +testData: org.apache.spark.sql.DataFrame = [id: int, text: string] +pipeline: com.johnsnowlabs.nlp.pretrained.PretrainedPipeline = PretrainedPipeline(explain_document_dl,en,public/models) +annotation: org.apache.spark.sql.DataFrame = [id: int, text: string ... 
10 more fields] ++---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| id| text| document| token| sentence| checked| lemma| stem| pos| embeddings| ner| entities| ++---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ +| 1|Google has announ...|[[document, 0, 10...|[[token, 0, 5, Go...|[[document, 0, 10...|[[token, 0, 5, Go...|[[token, 0, 5, Go...|[[token, 0, 5, go...|[[pos, 0, 5, NNP,...|[[word_embeddings...|[[named_entity, 0...|[[chunk, 0, 5, Go...| +| 2|The Paris metro w...|[[document, 0, 11...|[[token, 0, 2, Th...|[[document, 0, 11...|[[token, 0, 2, Th...|[[token, 0, 2, Th...|[[token, 0, 2, th...|[[pos, 0, 2, DT, ...|[[word_embeddings...|[[named_entity, 0...|[[chunk, 4, 8, Pa...| ++---+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+ +*/ + +annotation.select("entities.result").show(false) + +/* ++----------------------------------+ +|result | ++----------------------------------+ +|[Google, TensorFlow] | +|[Donald John Trump, United States]| ++----------------------------------+ +*/ +``` + +#### Showing Available Pipelines + +There are functions in Spark NLP that will list all the available Pipelines +of a particular language for you: + +```scala +import com.johnsnowlabs.nlp.pretrained.ResourceDownloader + +ResourceDownloader.showPublicPipelines(lang = "en") +/* ++--------------------------------------------+------+---------+ +| Pipeline | lang | version | ++--------------------------------------------+------+---------+ 
+| dependency_parse | en | 2.0.2 | +| analyze_sentiment_ml | en | 2.0.2 | +| check_spelling | en | 2.1.0 | +| match_datetime | en | 2.1.0 | + ... +| explain_document_ml | en | 3.1.3 | ++--------------------------------------------+------+---------+ +*/ +``` + +Or if we want to check for a particular version: + +```scala +import com.johnsnowlabs.nlp.pretrained.ResourceDownloader + +ResourceDownloader.showPublicPipelines(lang = "en", version = "3.1.0") +/* ++---------------------------------------+------+---------+ +| Pipeline | lang | version | ++---------------------------------------+------+---------+ +| dependency_parse | en | 2.0.2 | + ... +| clean_slang | en | 3.0.0 | +| clean_pattern | en | 3.0.0 | +| check_spelling | en | 3.0.0 | +| dependency_parse | en | 3.0.0 | ++---------------------------------------+------+---------+ +*/ +``` + +#### Please check out our Models Hub for the full list of [pre-trained pipelines](https://sparknlp.org/models) with examples, demos, benchmarks, and more + +### Models + +**Some selected languages: +** `Afrikaans, Arabic, Armenian, Basque, Bengali, Breton, Bulgarian, Catalan, Czech, Dutch, English, Esperanto, Finnish, French, Galician, German, Greek, Hausa, Hebrew, Hindi, Hungarian, Indonesian, Irish, Italian, Japanese, Latin, Latvian, Marathi, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Slovak, Slovenian, Somali, Southern Sotho, Spanish, Swahili, Swedish, Tswana, Turkish, Ukrainian, Zulu` + +**Quick online example:** + +```python +# load NER model trained by deep learning approach and GloVe word embeddings +ner_dl = NerDLModel.pretrained('ner_dl') +# load NER model trained by deep learning approach and BERT word embeddings +ner_bert = NerDLModel.pretrained('ner_dl_bert') +``` + +```scala +// load French POS tagger model trained by Universal Dependencies +val french_pos = PerceptronModel.pretrained("pos_ud_gsd", lang = "fr") +// load Italian LemmatizerModel +val italian_lemma = 
LemmatizerModel.pretrained("lemma_dxc", lang = "it") +```` + +**Quick offline example:** + +- Loading `PerceptronModel` annotator model inside Spark NLP Pipeline + +```scala +val french_pos = PerceptronModel.load("/tmp/pos_ud_gsd_fr_2.0.2_2.4_1556531457346/") + .setInputCols("document", "token") + .setOutputCol("pos") +``` + +#### Showing Available Models + +There are functions in Spark NLP that will list all the available Models +of a particular Annotator and language for you: + +```scala +import com.johnsnowlabs.nlp.pretrained.ResourceDownloader + +ResourceDownloader.showPublicModels(annotator = "NerDLModel", lang = "en") +/* ++---------------------------------------------+------+---------+ +| Model | lang | version | ++---------------------------------------------+------+---------+ +| onto_100 | en | 2.1.0 | +| onto_300 | en | 2.1.0 | +| ner_dl_bert | en | 2.2.0 | +| onto_100 | en | 2.4.0 | +| ner_conll_elmo | en | 3.2.2 | ++---------------------------------------------+------+---------+ +*/ +``` + +Or if we want to check for a particular version: + +```scala +import com.johnsnowlabs.nlp.pretrained.ResourceDownloader + +ResourceDownloader.showPublicModels(annotator = "NerDLModel", lang = "en", version = "3.1.0") +/* ++----------------------------+------+---------+ +| Model | lang | version | ++----------------------------+------+---------+ +| onto_100 | en | 2.1.0 | +| ner_aspect_based_sentiment | en | 2.6.2 | +| ner_weibo_glove_840B_300d | en | 2.6.2 | +| nerdl_atis_840b_300d | en | 2.7.1 | +| nerdl_snips_100d | en | 2.7.3 | ++----------------------------+------+---------+ +*/ +``` + +And to see a list of available annotators, you can use: + +```scala +import com.johnsnowlabs.nlp.pretrained.ResourceDownloader + +ResourceDownloader.showAvailableAnnotators() +/* +AlbertEmbeddings +AlbertForTokenClassification +AssertionDLModel +... 
+XlmRoBertaSentenceEmbeddings +XlnetEmbeddings +*/ +``` + +#### Please check out our Models Hub for the full list of [pre-trained models](https://sparknlp.org/models) with examples, demo, benchmark, and more + +## Offline + +Spark NLP library and all the pre-trained models/pipelines can be used entirely offline with no access to the Internet. +If you are behind a proxy or a firewall with no access to the Maven repository (to download packages) or/and no access +to S3 (to automatically download models and pipelines), you can simply follow the instructions to have Spark NLP without +any limitations offline: + +- Instead of using the Maven package, you need to load our Fat JAR +- Instead of using PretrainedPipeline for pretrained pipelines or the `.pretrained()` function to download pretrained + models, you will need to manually download your pipeline/model from [Models Hub](https://sparknlp.org/models), + extract it, and load it. + +Example of `SparkSession` with Fat JAR to have Spark NLP offline: + +```python +spark = SparkSession.builder + .appName("Spark NLP") + .master("local[*]") + .config("spark.driver.memory", "16G") + .config("spark.driver.maxResultSize", "0") + .config("spark.kryoserializer.buffer.max", "2000M") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.2.1.jar") + .getOrCreate() +``` + +- You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), + please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark + version (3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x) +- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need + to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( + i.e., `hdfs:///tmp/spark-nlp-assembly-5.2.1.jar`) + +Example of using pretrained Models and Pipelines in offline: + +```python +# instead of using pretrained() for online: +# french_pos = PerceptronModel.pretrained("pos_ud_gsd", lang="fr") +# you download this model, extract it, and use .load +french_pos = PerceptronModel.load("/tmp/pos_ud_gsd_fr_2.0.2_2.4_1556531457346/") + .setInputCols("document", "token") + .setOutputCol("pos") + +# example for pipelines +# instead of using PretrainedPipeline +# pipeline = PretrainedPipeline('explain_document_dl', lang='en') +# you download this pipeline, extract it, and use PipelineModel +PipelineModel.load("/tmp/explain_document_dl_en_2.0.2_2.4_1556530585689/") +``` + +- Since you are downloading and loading models/pipelines manually, this means Spark NLP is not downloading the most + recent and compatible models/pipelines for you. Choosing the right model/pipeline is on you +- If you are local, you can load the model/pipeline from your local FileSystem, however, if you are in a cluster setup + you need to put the model/pipeline on a distributed FileSystem such as HDFS, DBFS, S3, etc. ( + i.e., `hdfs:///tmp/explain_document_dl_en_2.0.2_2.4_1556530585689/`) + +## Examples + +Need more **examples**? Check out our dedicated [Spark NLP Examples](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples) +repository to showcase all Spark NLP use cases! + +Also, don't forget to check [Spark NLP in Action](https://sparknlp.org/demo) built by Streamlit. 
+ +### All examples: [spark-nlp/examples](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples) + +## FAQ + +[Check our Articles and Videos page here](https://sparknlp.org/learn) + +## Citation + +We have published a [paper](https://www.sciencedirect.com/science/article/pii/S2665963821000063) that you can cite for +the Spark NLP library: + +```bibtex +@article{KOCAMAN2021100058, + title = {Spark NLP: Natural language understanding at scale}, + journal = {Software Impacts}, + pages = {100058}, + year = {2021}, + issn = {2665-9638}, + doi = {https://doi.org/10.1016/j.simpa.2021.100058}, + url = {https://www.sciencedirect.com/science/article/pii/S2665963821000063}, + author = {Veysel Kocaman and David Talby}, + keywords = {Spark, Natural language processing, Deep learning, Tensorflow, Cluster}, + abstract = {Spark NLP is a Natural Language Processing (NLP) library built on top of Apache Spark ML. It provides simple, performant & accurate NLP annotations for machine learning pipelines that can scale easily in a distributed environment. Spark NLP comes with 1100+ pretrained pipelines and models in more than 192+ languages. It supports nearly all the NLP tasks and modules that can be used seamlessly in a cluster. Downloaded more than 2.7 million times and experiencing 9x growth since January 2020, Spark NLP is used by 54% of healthcare organizations as the world’s most widely used NLP library in the enterprise.} +} +``` + +## Contributing + +We appreciate any sort of contributions: -1. Install requirements `requirements_doc.txt` -2. run `make html` +- ideas +- feedback +- documentation +- bug reports +- NLP training and testing corpora +- Development and testing -The html will be available under `_build/html/index.html`. +Clone the repo and submit your pull-requests! Or directly create issues in this repo. -## Note +## John Snow Labs -The folder `_autosummary` should not be committed, as it is generated from sphinx itself. 
+[http://johnsnowlabs.com](http://johnsnowlabs.com) diff --git a/docs/_layouts/landing.html b/docs/_layouts/landing.html index 4fd347e6278dd4..e55be733e841c7 100755 --- a/docs/_layouts/landing.html +++ b/docs/_layouts/landing.html @@ -201,7 +201,7 @@

{{ _section.title }}

{% highlight bash %} # Using PyPI - $ pip install spark-nlp==5.2.0 + $ pip install spark-nlp==5.2.1 # Using Anaconda/Conda $ conda install -c johnsnowlabs spark-nlp diff --git a/docs/en/concepts.md b/docs/en/concepts.md index 85d15eab96d730..bb97fcad4144cf 100644 --- a/docs/en/concepts.md +++ b/docs/en/concepts.md @@ -66,7 +66,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.2.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.2.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` diff --git a/docs/en/examples.md b/docs/en/examples.md index a7efd34e02a683..40f98d42b08c1d 100644 --- a/docs/en/examples.md +++ b/docs/en/examples.md @@ -18,7 +18,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp -$ pip install spark-nlp==5.2.0 pyspark==3.3.1 +$ pip install spark-nlp==5.2.1 pyspark==3.3.1 ```
@@ -40,7 +40,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!bash colab.sh -p 3.2.3 -s 5.2.0 +!bash colab.sh -p 3.2.3 -s 5.2.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. diff --git a/docs/en/hardware_acceleration.md b/docs/en/hardware_acceleration.md index dd33bccc585f76..8a15d714bd1ba0 100644 --- a/docs/en/hardware_acceleration.md +++ b/docs/en/hardware_acceleration.md @@ -49,7 +49,7 @@ Since the new Transformer models such as BERT for Word and Sentence embeddings a | DeBERTa Large | +477%(5.8x) | | Longformer Base | +52%(1.5x) | -Spark NLP 5.2.0 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.2.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/docs/en/install.md b/docs/en/install.md index fb7d808a1f1778..8c6c931dcc48fb 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -17,22 +17,22 @@ sidebar: ```bash # Install Spark NLP from PyPI -pip install spark-nlp==5.2.0 +pip install spark-nlp==5.2.1 # Install Spark NLP from Anaconda/Conda conda install -c johnsnowlabs spark-nlp # Load Spark NLP with Spark Shell -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 # Load Spark NLP with PySpark -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 # Load Spark NLP with Spark Submit -spark-submit --packages 
com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 # Load Spark NLP as external JAR after compiling and building Spark NLP by `sbt assembly` -spark-shell --jars spark-nlp-assembly-5.2.0.jar +spark-shell --jars spark-nlp-assembly-5.2.1.jar ```
@@ -55,7 +55,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp -$ pip install spark-nlp==5.2.0 pyspark==3.3.1 +$ pip install spark-nlp==5.2.1 pyspark==3.3.1 ``` Of course you will need to have jupyter installed in your system: @@ -83,7 +83,7 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0")\ + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1")\ .getOrCreate() ``` @@ -100,7 +100,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp_2.12 - 5.2.0 + 5.2.1 ``` @@ -111,7 +111,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.2.0 + 5.2.1 ``` @@ -122,7 +122,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.2.0 + 5.2.1 ``` @@ -133,7 +133,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.2.0 + 5.2.1 ``` @@ -145,28 +145,28 @@ spark = SparkSession.builder \ ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.2.1" ``` **spark-nlp-gpu:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.2.1" ``` **spark-nlp-silicon:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.1" ``` **spark-nlp-aarch64:** ```scala // 
https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.2.1" ``` Maven Central: [https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) @@ -248,7 +248,7 @@ maven coordinates like these: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.2.0 + 5.2.1 ``` @@ -256,7 +256,7 @@ or in case of sbt: ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.1" ``` If everything went well, you can now start Spark NLP with the `m1` flag set to `true`: @@ -293,7 +293,7 @@ spark = sparknlp.start(apple_silicon=True) ## Installation for Linux Aarch64 Systems -Starting from version 5.2.0, Spark NLP supports Linux systems running on an aarch64 +Starting from version 5.2.1, Spark NLP supports Linux systems running on an aarch64 processor architecture. The necessary dependencies have been built on Ubuntu 16.04, so a recent system with an environment of at least that will be needed. @@ -341,7 +341,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.0 +!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. 
@@ -363,7 +363,7 @@ Run the following code in Kaggle Kernel and start using spark-nlp right away. ## Databricks Support -Spark NLP 5.2.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.2.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -445,7 +445,7 @@ Spark NLP 5.2.0 has been tested and is compatible with the following runtimes: 3.1. Install New -> PyPI -> `spark-nlp` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -465,7 +465,7 @@ Note: You can import these notebooks by using their URLs. ## EMR Support -Spark NLP 5.2.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.2.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -528,7 +528,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1" } } ] @@ -538,7 +538,7 @@ A sample of AWS CLI to launch EMR cluster: ```sh aws emr create-cluster \ ---name "Spark NLP 5.2.0" \ +--name "Spark NLP 5.2.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -803,7 +803,7 @@ We recommend using `conda` to manage your Python environment on Windows. 
Now you can use the downloaded binary by navigating to `%SPARK_HOME%\bin` and running -Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0*. +Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1*. @@ -831,12 +831,12 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars", "/tmp/spark-nlp-assembly-5.2.0.jar")\ + .config("spark.jars", "/tmp/spark-nlp-assembly-5.2.1.jar")\ .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark version (3.x) -- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.2.0.jar`) +- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.2.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/docs/en/spark_nlp.md b/docs/en/spark_nlp.md index 1992a65722fa27..894076a6812caf 100644 --- a/docs/en/spark_nlp.md +++ b/docs/en/spark_nlp.md @@ -25,7 +25,7 @@ Spark NLP is built on top of **Apache Spark 3.x**. 
For using Spark NLP you need: **GPU (optional):** -Spark NLP 5.2.0 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.2.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/python/README.md b/python/README.md index 9c7eac3bd0e865..e2319c0eecdd89 100644 --- a/python/README.md +++ b/python/README.md @@ -173,7 +173,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.2.0 is built with ONNX 1.16.3 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: +Spark NLP 5.2.1 is built with ONNX 1.16.3 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -189,7 +189,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.2.0 pyspark==3.3.1 +$ pip install spark-nlp==5.2.1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -234,7 +234,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.2.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x +Spark NLP *5.2.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x | Spark NLP | Apache Spark 3.5.x | Apache Spark 3.4.x | Apache Spark 3.3.x | Apache Spark 3.2.x | Apache Spark 3.1.x | Apache Spark 3.0.x | Apache Spark 2.4.x | Apache Spark 2.3.x | 
|-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -276,7 +276,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.2.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.2.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -343,7 +343,7 @@ Spark NLP 5.2.0 has been tested and is compatible with the following runtimes: ## EMR Support -Spark NLP 5.2.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.2.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -390,11 +390,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` The `spark-nlp` has been published to @@ -403,11 +403,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.2.1 ``` @@ -417,11 +417,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell 
--packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.2.1 ``` @@ -431,11 +431,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.2.1 ``` @@ -449,7 +449,7 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` ## Scala @@ -467,7 +467,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.2.0 + 5.2.1 ``` @@ -478,7 +478,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.2.0 + 5.2.1 ``` @@ -489,7 +489,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.2.0 + 5.2.1 ``` @@ -500,7 +500,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.2.0 + 5.2.1 ``` @@ -510,28 +510,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.2.1" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu 
-libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.2.1" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.2.1" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.2.1" ``` Maven @@ -553,7 +553,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==5.2.0 +pip install spark-nlp==5.2.1 ``` Conda: @@ -582,7 +582,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1") .getOrCreate() ``` @@ -653,7 +653,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -664,7 +664,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.2.0 +pip install spark-nlp==5.2.1 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -692,7 +692,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda 
activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.2.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.2.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -709,7 +709,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -736,7 +736,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -759,7 +759,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.2.1 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -778,9 +778,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.2.0` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.2.1` -> Install - 3.2. 
Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -831,7 +831,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1" } }] ``` @@ -840,7 +840,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.2.0" \ +--name "Spark NLP 5.2.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -904,7 +904,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -943,7 +943,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1") .getOrCreate() ``` @@ -957,7 +957,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` **pyspark:** @@ -970,7 +970,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.2.1 ``` **Databricks:** @@ -1242,7 +1242,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.2.0.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.2.1.jar") .getOrCreate() ``` @@ -1251,7 +1251,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, 3.4.x, and 3.5.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.2.0.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.2.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/python/docs/conf.py b/python/docs/conf.py index 1fa63e4c2b5fa4..3548702cddc7de 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,7 +23,7 @@ author = "John Snow Labs" # The full version, including alpha/beta/rc tags -release = "5.2.0" +release = "5.2.1" pyspark_version = "3.2.3" # -- General configuration --------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index 5283ae20da0ca3..ab6ce77fd78858 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='5.2.0', # Required + version='5.2.1', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 049823768b689f..edc01284b39039 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -128,7 +128,7 @@ def start(gpu=False, The initiated Spark session. """ - current_version = "5.2.0" + current_version = "5.2.1" if params is None: params = {} @@ -309,4 +309,4 @@ def version(): str The current Spark NLP version. 
""" - return '5.2.0' + return '5.2.1' diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index 2a40b58a0c62f5..2f93d717d566fa 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.2.0" +SPARKNLP="5.2.1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index a850a969ddac08..0ef87581fbf913 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.2.0" +SPARKNLP="5.2.1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/sagemaker_setup.sh b/scripts/sagemaker_setup.sh index 15bddb60f0d8b1..868086882fbf63 100644 --- a/scripts/sagemaker_setup.sh +++ b/scripts/sagemaker_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # Default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.2.0" +SPARKNLP="5.2.1" PYSPARK="3.2.3" echo "Setup SageMaker for PySpark $PYSPARK and Spark NLP $SPARKNLP" diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index 55d7cecbf3c88c..ae3eba79063575 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.SparkSession object SparkNLP { - val currentVersion = "5.2.0" + val currentVersion = "5.2.1" val MavenSpark3 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" val MavenGpuSpark3 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" val MavenSparkSilicon = s"com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:$currentVersion" diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala index 7bf01c8aa86a9a..861fa8f73351e7 100644 --- a/src/main/scala/com/johnsnowlabs/util/Build.scala +++ b/src/main/scala/com/johnsnowlabs/util/Build.scala @@ 
-17,5 +17,5 @@ package com.johnsnowlabs.util object Build { - val version: String = "5.2.0" + val version: String = "5.2.1" } From 92535c34ded90fdc3de6e08ac6bd3cabe472209b Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 27 Dec 2023 18:36:49 +0000 Subject: [PATCH 12/14] Update Scala and Python APIs --- docs/api/com/index.html | 8 +- .../com/johnsnowlabs/client/CloudClient.html | 8 +- .../com/johnsnowlabs/client/CloudManager.html | 8 +- .../johnsnowlabs/client/CloudResources$.html | 8 +- .../com/johnsnowlabs/client/CloudStorage.html | 8 +- .../client/aws/AWSAnonymousCredentials.html | 8 +- .../client/aws/AWSBasicCredentials.html | 8 +- .../johnsnowlabs/client/aws/AWSClient.html | 8 +- .../client/aws/AWSCredentialsProvider.html | 8 +- .../johnsnowlabs/client/aws/AWSGateway.html | 8 +- .../client/aws/AWSProfileCredentials.html | 8 +- .../client/aws/AWSTokenCredentials.html | 8 +- .../client/aws/CredentialParams.html | 8 +- .../johnsnowlabs/client/aws/Credentials.html | 8 +- .../com/johnsnowlabs/client/aws/index.html | 8 +- .../client/azure/AzureClient.html | 8 +- .../client/azure/AzureGateway.html | 8 +- .../com/johnsnowlabs/client/azure/index.html | 8 +- .../johnsnowlabs/client/gcp/GCPClient.html | 8 +- .../johnsnowlabs/client/gcp/GCPGateway.html | 8 +- .../com/johnsnowlabs/client/gcp/index.html | 8 +- docs/api/com/johnsnowlabs/client/index.html | 8 +- .../client/util/CloudHelper$.html | 8 +- .../com/johnsnowlabs/client/util/index.html | 8 +- .../johnsnowlabs/collections/SearchTrie$.html | 8 +- .../johnsnowlabs/collections/SearchTrie.html | 8 +- .../collections/StorageSearchTrie$.html | 8 +- .../collections/StorageSearchTrie.html | 8 +- .../com/johnsnowlabs/collections/index.html | 8 +- docs/api/com/johnsnowlabs/index.html | 8 +- docs/api/com/johnsnowlabs/ml/ai/DeBerta.html | 8 +- .../ml/ai/MergeTokenStrategy$.html | 8 +- .../johnsnowlabs/ml/ai/OpenAICompletion.html | 8 +- .../johnsnowlabs/ml/ai/OpenAIEmbeddings.html | 8 +- 
docs/api/com/johnsnowlabs/ml/ai/index.html | 8 +- .../com/johnsnowlabs/ml/ai/model/Choice.html | 8 +- .../ml/ai/model/CompletionResponse.html | 8 +- .../ml/ai/model/EmbeddingData.html | 8 +- .../ml/ai/model/TextEmbeddingResponse.html | 8 +- .../com/johnsnowlabs/ml/ai/model/Usage.html | 8 +- .../johnsnowlabs/ml/ai/model/UsageData.html | 8 +- .../com/johnsnowlabs/ml/ai/model/index.html | 8 +- .../ml/ai/seq2seq/DecoderProcessor.html | 8 +- .../ml/ai/seq2seq/OnnxT5EncoderDecoder.html | 8 +- .../ml/ai/seq2seq/T5EncoderDecoder.html | 8 +- .../com/johnsnowlabs/ml/ai/seq2seq/index.html | 8 +- .../ml/ai/t5/OnnxT5EncoderDecoder.html | 8 +- .../t5/T5EncoderDecoder$DecoderProcessor.html | 8 +- .../ml/ai/t5/T5EncoderDecoder.html | 8 +- docs/api/com/johnsnowlabs/ml/ai/t5/index.html | 8 +- .../ml/ai/util/Generation/Generate.html | 8 +- .../ai/util/Generation/GenerationConfig.html | 8 +- .../ml/ai/util/Generation/Logit/Logit.html | 8 +- .../ForcedTokenLogitProcessor.html | 8 +- .../Logit/LogitProcess/LogitProcessor.html | 8 +- .../LogitProcess/MinLengthLogitProcessor.html | 8 +- .../NoRepeatNgramsLogitProcessor.html | 8 +- .../RepetitionPenaltyLogitProcessor.html | 8 +- .../LogitProcess/SuppressLogitProcessor.html | 8 +- .../Generation/Logit/LogitProcess/index.html | 8 +- .../Generation/Logit/LogitProcessorList.html | 8 +- .../Logit/LogitWarper/LogitWarper.html | 8 +- .../LogitWarper/TemperatureLogitWarper.html | 8 +- .../Logit/LogitWarper/TopKLogitWarper.html | 8 +- .../Logit/LogitWarper/TopPLogitWarper.html | 8 +- .../Generation/Logit/LogitWarper/index.html | 8 +- .../ml/ai/util/Generation/Logit/index.html | 8 +- .../Generation/Search/BeamHypotheses.html | 8 +- .../ai/util/Generation/Search/BeamScorer.html | 8 +- .../Generation/Search/BeamSearchScorer.html | 8 +- .../ml/ai/util/Generation/Search/index.html | 8 +- .../ml/ai/util/Generation/index.html | 8 +- .../com/johnsnowlabs/ml/ai/util/index.html | 8 +- docs/api/com/johnsnowlabs/ml/crf/Attr.html | 8 +- 
.../com/johnsnowlabs/ml/crf/AttrFeature.html | 8 +- .../api/com/johnsnowlabs/ml/crf/AttrStat.html | 8 +- .../com/johnsnowlabs/ml/crf/CrfDataset.html | 8 +- .../com/johnsnowlabs/ml/crf/CrfParams.html | 8 +- .../johnsnowlabs/ml/crf/DatasetEncoder.html | 8 +- .../johnsnowlabs/ml/crf/DatasetMetadata.html | 8 +- .../johnsnowlabs/ml/crf/DatasetReader$.html | 8 +- .../johnsnowlabs/ml/crf/EdgeCalculator$.html | 8 +- .../com/johnsnowlabs/ml/crf/FbCalculator.html | 8 +- .../api/com/johnsnowlabs/ml/crf/Instance.html | 8 +- .../johnsnowlabs/ml/crf/InstanceLabels.html | 8 +- .../johnsnowlabs/ml/crf/L2DecayStrategy.html | 8 +- .../johnsnowlabs/ml/crf/LinearChainCrf.html | 8 +- .../ml/crf/LinearChainCrfModel.html | 8 +- .../ml/crf/SerializedDatasetMetadata.html | 8 +- .../ml/crf/SerializedLinearChainCrfModel.html | 8 +- .../ml/crf/SparseArray$$SeqWrapper.html | 8 +- .../com/johnsnowlabs/ml/crf/SparseArray$.html | 8 +- .../com/johnsnowlabs/ml/crf/SparseArray.html | 8 +- .../ml/crf/TextSentenceAttrs.html | 8 +- .../ml/crf/TextSentenceLabels.html | 8 +- .../com/johnsnowlabs/ml/crf/Transition.html | 8 +- .../com/johnsnowlabs/ml/crf/VectorMath$.html | 8 +- .../com/johnsnowlabs/ml/crf/WordAttrs.html | 8 +- docs/api/com/johnsnowlabs/ml/crf/index.html | 8 +- docs/api/com/johnsnowlabs/ml/index.html | 8 +- .../com/johnsnowlabs/ml/onnx/OnnxSession.html | 8 +- .../OnnxWrapper$$EncoderDecoderWrappers.html | 8 +- .../johnsnowlabs/ml/onnx/OnnxWrapper$.html | 8 +- .../com/johnsnowlabs/ml/onnx/OnnxWrapper.html | 8 +- .../johnsnowlabs/ml/onnx/ReadOnnxModel.html | 10 +- ...sources$$implicits$$OnnxSessionResult.html | 8 +- .../ml/onnx/TensorResources$$implicits$.html | 8 +- .../ml/onnx/TensorResources$.html | 8 +- .../johnsnowlabs/ml/onnx/TensorResources.html | 8 +- .../johnsnowlabs/ml/onnx/WriteOnnxModel.html | 10 +- docs/api/com/johnsnowlabs/ml/onnx/index.html | 8 +- .../tensorflow/ClassifierDatasetEncoder.html | 8 +- .../ClassifierDatasetEncoderParams.html | 8 +- 
.../ml/tensorflow/DatasetEncoderParams.html | 8 +- .../johnsnowlabs/ml/tensorflow/Logging.html | 8 +- .../ml/tensorflow/ModelSignature.html | 8 +- .../johnsnowlabs/ml/tensorflow/NerBatch$.html | 8 +- .../johnsnowlabs/ml/tensorflow/NerBatch.html | 8 +- .../ml/tensorflow/NerDatasetEncoder.html | 8 +- .../ml/tensorflow/ReadTensorflowModel.html | 10 +- .../ml/tensorflow/SentenceGrouper.html | 8 +- .../ml/tensorflow/TensorResources$.html | 8 +- .../ml/tensorflow/TensorResources.html | 8 +- .../ml/tensorflow/TensorflowClassifier.html | 8 +- .../ml/tensorflow/TensorflowWrapper$.html | 8 +- .../ml/tensorflow/TensorflowWrapper.html | 8 +- .../johnsnowlabs/ml/tensorflow/Variables.html | 8 +- .../ml/tensorflow/WriteTensorflowModel.html | 10 +- .../com/johnsnowlabs/ml/tensorflow/index.html | 8 +- .../sentencepiece/ReadSentencePieceModel.html | 8 +- .../sentencepiece/SentencePieceException.html | 8 +- .../sentencepiece/SentencePieceProcessor.html | 8 +- .../sentencepiece/SentencePieceWrapper$.html | 8 +- .../WriteSentencePieceModel.html | 8 +- .../ml/tensorflow/sentencepiece/index.html | 8 +- ...delSignatureConstants$$AttentionMask$.html | 8 +- ...lSignatureConstants$$AttentionMaskV1$.html | 8 +- ...SignatureConstants$$AudioValuesInput$.html | 8 +- ...s$$CachedDecoderEncoderAttentionMask$.html | 8 +- ...stants$$CachedDecoderEncoderInputIds$.html | 8 +- ...eConstants$$CachedDecoderInputCache1$.html | 8 +- ...eConstants$$CachedDecoderInputCache2$.html | 8 +- ...tureConstants$$CachedDecoderInputIds$.html | 8 +- ...natureConstants$$CachedEncoderOutput$.html | 8 +- ...gnatureConstants$$CachedLogitsOutput$.html | 8 +- ...delSignatureConstants$$CachedOutPut2$.html | 8 +- ...delSignatureConstants$$CachedOutput1$.html | 8 +- .../sign/ModelSignatureConstants$$DType$.html | 8 +- ...atureConstants$$DecoderAttentionMask$.html | 8 +- ...ureConstants$$DecoderCachedCache1Key$.html | 8 +- ...ureConstants$$DecoderCachedCache2Key$.html | 8 +- ...ts$$DecoderCachedEncoderAttentionKey$.html | 8 +- 
...stants$$DecoderCachedEncoderStateKey$.html | 8 +- ...eConstants$$DecoderCachedInputIdsKey$.html | 8 +- ...natureConstants$$DecoderCachedOutput$.html | 8 +- ...stants$$DecoderCachedOutputCache1Key$.html | 8 +- ...stants$$DecoderCachedOutputCache2Key$.html | 8 +- ...ureConstants$$DecoderCachedOutputKey$.html | 8 +- ...nstants$$DecoderEncoderAttentionMask$.html | 8 +- ...ureConstants$$DecoderEncoderInputIds$.html | 8 +- ...onstants$$DecoderInitOutputCache1Key$.html | 8 +- ...onstants$$DecoderInitOutputCache2Key$.html | 8 +- ...lSignatureConstants$$DecoderInputIds$.html | 8 +- ...delSignatureConstants$$DecoderOutput$.html | 8 +- .../ModelSignatureConstants$$DimCount$.html | 8 +- ...atureConstants$$EncoderAttentionMask$.html | 8 +- ...gnatureConstants$$EncoderContextMask$.html | 8 +- ...lSignatureConstants$$EncoderInputIds$.html | 8 +- ...delSignatureConstants$$EncoderOutput$.html | 8 +- ...lSignatureConstants$$EndLogitsOutput$.html | 8 +- ...ignatureConstants$$InitCachedOutPut2$.html | 8 +- ...ignatureConstants$$InitCachedOutput1$.html | 8 +- ...nts$$InitDecoderEncoderAttentionMask$.html | 8 +- ...onstants$$InitDecoderEncoderInputIds$.html | 8 +- ...natureConstants$$InitDecoderInputIds$.html | 8 +- ...SignatureConstants$$InitLogitsOutput$.html | 8 +- .../ModelSignatureConstants$$InputIds$.html | 8 +- .../ModelSignatureConstants$$InputIdsV1$.html | 8 +- ...lSignatureConstants$$LastHiddenState$.html | 8 +- ...ignatureConstants$$LastHiddenStateV1$.html | 8 +- ...odelSignatureConstants$$LogitsOutput$.html | 8 +- .../sign/ModelSignatureConstants$$Name$.html | 8 +- ...SignatureConstants$$PixelValuesInput$.html | 8 +- ...odelSignatureConstants$$PoolerOutput$.html | 8 +- ...elSignatureConstants$$PoolerOutputV1$.html | 8 +- ...elSignatureConstants$$SerializedSize$.html | 8 +- ...odelSignatureConstants$$ShapeDimList$.html | 8 +- ...ignatureConstants$$StartLogitsOutput$.html | 8 +- ...lSignatureConstants$$TFInfoDescriptor.html | 8 +- 
...lSignatureConstants$$TFInfoNameMapper.html | 8 +- ...stants$$TapasLogitsAggregationOutput$.html | 8 +- ...ignatureConstants$$TapasLogitsOutput$.html | 8 +- ...odelSignatureConstants$$TokenTypeIds$.html | 8 +- ...elSignatureConstants$$TokenTypeIdsV1$.html | 8 +- .../sign/ModelSignatureConstants$.html | 8 +- .../sign/ModelSignatureManager$.html | 8 +- .../ml/tensorflow/sign/index.html | 8 +- ...inAlg$$implicits$$ExtendedDenseMatrix.html | 8 +- .../ml/util/LinAlg$$implicits$.html | 8 +- .../api/com/johnsnowlabs/ml/util/LinAlg$.html | 99 +- .../ml/util/LoadExternalModel$.html | 8 +- .../com/johnsnowlabs/ml/util/ModelArch$.html | 8 +- .../com/johnsnowlabs/ml/util/ModelEngine.html | 8 +- docs/api/com/johnsnowlabs/ml/util/ONNX$.html | 8 +- .../com/johnsnowlabs/ml/util/PyTorch$.html | 8 +- .../com/johnsnowlabs/ml/util/TensorFlow$.html | 8 +- .../com/johnsnowlabs/ml/util/Unknown$.html | 8 +- docs/api/com/johnsnowlabs/ml/util/index.html | 8 +- .../johnsnowlabs/nlp/ActivationFunction$.html | 8 +- .../nlp/Annotation$$AnnotationContainer.html | 8 +- ...nnotation$$extractors$$AnnotationData.html | 8 +- .../nlp/Annotation$$extractors$.html | 8 +- .../api/com/johnsnowlabs/nlp/Annotation$.html | 8 +- docs/api/com/johnsnowlabs/nlp/Annotation.html | 8 +- .../AnnotationAudio$$AnnotationContainer.html | 8 +- .../nlp/AnnotationAudio$$AudioFields.html | 8 +- .../johnsnowlabs/nlp/AnnotationAudio$.html | 8 +- .../com/johnsnowlabs/nlp/AnnotationAudio.html | 8 +- .../AnnotationImage$$AnnotationContainer.html | 8 +- .../nlp/AnnotationImage$$ImageFields.html | 8 +- .../johnsnowlabs/nlp/AnnotationImage$.html | 8 +- .../com/johnsnowlabs/nlp/AnnotationImage.html | 8 +- .../johnsnowlabs/nlp/AnnotatorApproach.html | 8 +- .../com/johnsnowlabs/nlp/AnnotatorModel.html | 10 +- .../com/johnsnowlabs/nlp/AnnotatorType$.html | 8 +- .../com/johnsnowlabs/nlp/AudioAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/AudioAssembler.html | 8 +- docs/api/com/johnsnowlabs/nlp/CanBeLazy.html | 10 +- 
docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html | 8 +- docs/api/com/johnsnowlabs/nlp/Doc2Chunk.html | 8 +- .../johnsnowlabs/nlp/DocumentAssembler$.html | 8 +- .../johnsnowlabs/nlp/DocumentAssembler.html | 8 +- .../johnsnowlabs/nlp/EmbeddingsFinisher$.html | 8 +- .../johnsnowlabs/nlp/EmbeddingsFinisher.html | 8 +- .../com/johnsnowlabs/nlp/FeaturesReader.html | 8 +- .../com/johnsnowlabs/nlp/FeaturesWriter.html | 8 +- docs/api/com/johnsnowlabs/nlp/Finisher$.html | 8 +- docs/api/com/johnsnowlabs/nlp/Finisher.html | 8 +- .../com/johnsnowlabs/nlp/GraphFinisher.html | 8 +- .../nlp/HasAudioFeatureProperties.html | 8 +- .../johnsnowlabs/nlp/HasBatchedAnnotate.html | 10 +- .../nlp/HasBatchedAnnotateAudio.html | 8 +- .../nlp/HasBatchedAnnotateImage.html | 8 +- .../nlp/HasCandidateLabelsProperties.html | 8 +- .../nlp/HasCaseSensitiveProperties.html | 10 +- .../HasClassifierActivationProperties.html | 8 +- .../nlp/HasEnableCachingProperties.html | 8 +- docs/api/com/johnsnowlabs/nlp/HasEngine.html | 10 +- .../api/com/johnsnowlabs/nlp/HasFeatures.html | 10 +- .../nlp/HasGeneratorProperties.html | 8 +- .../nlp/HasImageFeatureProperties.html | 8 +- .../nlp/HasInputAnnotationCols.html | 10 +- .../nlp/HasMultipleInputAnnotationCols.html | 8 +- .../nlp/HasOutputAnnotationCol.html | 10 +- .../nlp/HasOutputAnnotatorType.html | 10 +- .../com/johnsnowlabs/nlp/HasPretrained.html | 10 +- .../HasProtectedParams$ProtectedParam.html | 8 +- .../johnsnowlabs/nlp/HasProtectedParams.html | 10 +- .../com/johnsnowlabs/nlp/HasRecursiveFit.html | 8 +- .../nlp/HasRecursiveTransform.html | 8 +- .../johnsnowlabs/nlp/HasSimpleAnnotate.html | 8 +- .../api/com/johnsnowlabs/nlp/IAnnotation.html | 8 +- .../com/johnsnowlabs/nlp/ImageAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/ImageAssembler.html | 8 +- .../com/johnsnowlabs/nlp/JavaAnnotation.html | 8 +- .../com/johnsnowlabs/nlp/LightPipeline.html | 8 +- .../nlp/MultiDocumentAssembler$.html | 8 +- .../nlp/MultiDocumentAssembler.html | 8 +- 
.../nlp/ParamsAndFeaturesReadable.html | 10 +- .../nlp/ParamsAndFeaturesWritable.html | 10 +- .../com/johnsnowlabs/nlp/RawAnnotator.html | 10 +- .../johnsnowlabs/nlp/RecursivePipeline.html | 8 +- .../nlp/RecursivePipelineModel.html | 8 +- docs/api/com/johnsnowlabs/nlp/SparkNLP$.html | 8 +- .../com/johnsnowlabs/nlp/TableAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/TableAssembler.html | 8 +- .../com/johnsnowlabs/nlp/TokenAssembler$.html | 8 +- .../com/johnsnowlabs/nlp/TokenAssembler.html | 8 +- .../nlp/annotators/Chunk2Doc$.html | 12 +- .../nlp/annotators/Chunk2Doc.html | 12 +- .../nlp/annotators/ChunkTokenizer$.html | 12 +- .../nlp/annotators/ChunkTokenizer.html | 12 +- .../nlp/annotators/ChunkTokenizerModel$.html | 12 +- .../nlp/annotators/ChunkTokenizerModel.html | 12 +- .../johnsnowlabs/nlp/annotators/Chunker$.html | 12 +- .../johnsnowlabs/nlp/annotators/Chunker.html | 12 +- .../nlp/annotators/Date2Chunk$.html | 12 +- .../nlp/annotators/Date2Chunk.html | 12 +- .../nlp/annotators/DateMatcher$.html | 12 +- .../nlp/annotators/DateMatcher.html | 12 +- .../nlp/annotators/DateMatcherTranslator.html | 12 +- .../DateMatcherTranslatorPolicy.html | 12 +- .../nlp/annotators/DateMatcherUtils.html | 12 +- .../DocumentCharacterTextSplitter$.html | 1172 ++++++ .../DocumentCharacterTextSplitter.html | 20 +- .../nlp/annotators/DocumentNormalizer$.html | 12 +- .../nlp/annotators/DocumentNormalizer.html | 12 +- .../annotators/DocumentTokenSplitter$.html | 1172 ++++++ .../nlp/annotators/DocumentTokenSplitter.html | 20 +- .../nlp/annotators/EnglishStemmer$.html | 12 +- .../nlp/annotators/GraphExtraction.html | 12 +- .../nlp/annotators/Lemmatizer$.html | 12 +- .../nlp/annotators/Lemmatizer.html | 12 +- .../nlp/annotators/LemmatizerModel$.html | 12 +- .../nlp/annotators/LemmatizerModel.html | 12 +- .../nlp/annotators/LookAroundManager$.html | 12 +- .../nlp/annotators/MultiDateMatcher$.html | 12 +- .../nlp/annotators/MultiDateMatcher.html | 12 +- 
.../nlp/annotators/MultiDatePolicy$.html | 12 +- .../nlp/annotators/NGramGenerator$.html | 12 +- .../nlp/annotators/NGramGenerator.html | 12 +- .../nlp/annotators/Normalizer$.html | 12 +- .../nlp/annotators/Normalizer.html | 12 +- .../nlp/annotators/NormalizerModel$.html | 12 +- ...alizerModel$TokenizerAndNormalizerMap.html | 8 +- .../nlp/annotators/NormalizerModel.html | 12 +- .../annotators/PretrainedAnnotations$.html | 12 +- .../ReadablePretrainedLemmatizer.html | 12 +- ...adablePretrainedStopWordsCleanerModel.html | 12 +- .../ReadablePretrainedTextMatcher.html | 12 +- .../ReadablePretrainedTokenizer.html | 12 +- .../nlp/annotators/RecursiveTokenizer.html | 12 +- .../annotators/RecursiveTokenizerModel$.html | 12 +- .../annotators/RecursiveTokenizerModel.html | 12 +- .../nlp/annotators/RegexMatcher$.html | 12 +- .../nlp/annotators/RegexMatcher.html | 12 +- .../nlp/annotators/RegexMatcherModel$.html | 12 +- .../nlp/annotators/RegexMatcherModel.html | 12 +- .../nlp/annotators/RegexTokenizer$.html | 12 +- .../nlp/annotators/RegexTokenizer.html | 12 +- .../nlp/annotators/SingleDatePolicy$.html | 12 +- .../johnsnowlabs/nlp/annotators/Stemmer$.html | 12 +- .../johnsnowlabs/nlp/annotators/Stemmer.html | 12 +- .../nlp/annotators/StopWordsCleaner$.html | 12 +- .../nlp/annotators/StopWordsCleaner.html | 12 +- .../nlp/annotators/TextMatcher$.html | 12 +- .../nlp/annotators/TextMatcher.html | 12 +- .../nlp/annotators/TextMatcherModel$.html | 12 +- .../nlp/annotators/TextMatcherModel.html | 12 +- .../nlp/annotators/TextSplitter.html | 12 +- .../nlp/annotators/Token2Chunk$.html | 12 +- .../nlp/annotators/Token2Chunk.html | 12 +- .../nlp/annotators/Tokenizer$.html | 12 +- .../nlp/annotators/Tokenizer.html | 12 +- .../nlp/annotators/TokenizerModel$.html | 12 +- .../nlp/annotators/TokenizerModel.html | 12 +- .../nlp/annotators/audio/HubertForCTC$.html | 8 +- .../nlp/annotators/audio/HubertForCTC.html | 8 +- .../audio/ReadHubertForAudioDLModel.html | 8 +- 
.../audio/ReadWav2Vec2ForAudioDLModel.html | 8 +- .../audio/ReadWhisperForCTCDLModel.html | 8 +- ...ReadablePretrainedHubertForAudioModel.html | 8 +- ...adablePretrainedWav2Vec2ForAudioModel.html | 8 +- .../ReadablePretrainedWhisperForCTCModel.html | 8 +- .../nlp/annotators/audio/Wav2Vec2ForCTC$.html | 8 +- .../nlp/annotators/audio/Wav2Vec2ForCTC.html | 8 +- .../nlp/annotators/audio/WhisperForCTC$.html | 8 +- .../nlp/annotators/audio/WhisperForCTC.html | 8 +- .../audio/feature_extractor/AudioUtils$.html | 8 +- .../PreprocessorAttributes$.html | 8 +- .../WhisperPreprocessor.html | 8 +- .../audio/feature_extractor/index.html | 8 +- .../nlp/annotators/audio/index.html | 8 +- .../nlp/annotators/btm/BigTextMatcher$.html | 8 +- .../nlp/annotators/btm/BigTextMatcher.html | 8 +- .../annotators/btm/BigTextMatcherModel$.html | 8 +- .../annotators/btm/BigTextMatcherModel.html | 8 +- .../btm/ReadablePretrainedBigTextMatcher.html | 8 +- .../nlp/annotators/btm/TMEdgesReadWriter.html | 8 +- .../nlp/annotators/btm/TMEdgesReader.html | 8 +- .../nlp/annotators/btm/TMNodesReader.html | 8 +- .../nlp/annotators/btm/TMNodesWriter.html | 8 +- .../nlp/annotators/btm/TMVocabReadWriter.html | 8 +- .../nlp/annotators/btm/TMVocabReader.html | 8 +- .../nlp/annotators/btm/TrieNode.html | 8 +- .../nlp/annotators/btm/index.html | 8 +- .../dl/AlbertForQuestionAnswering$.html | 8 +- .../dl/AlbertForQuestionAnswering.html | 8 +- .../dl/AlbertForSequenceClassification$.html | 8 +- .../dl/AlbertForSequenceClassification.html | 8 +- .../dl/AlbertForTokenClassification$.html | 8 +- .../dl/AlbertForTokenClassification.html | 8 +- .../dl/BartForZeroShotClassification$.html | 8 +- .../dl/BartForZeroShotClassification.html | 8 +- .../dl/BertForQuestionAnswering$.html | 8 +- .../dl/BertForQuestionAnswering.html | 8 +- .../dl/BertForSequenceClassification$.html | 8 +- .../dl/BertForSequenceClassification.html | 8 +- .../dl/BertForTokenClassification$.html | 8 +- .../dl/BertForTokenClassification.html | 8 +- 
.../dl/BertForZeroShotClassification$.html | 8 +- .../dl/BertForZeroShotClassification.html | 8 +- .../dl/CamemBertForQuestionAnswering$.html | 8 +- .../dl/CamemBertForQuestionAnswering.html | 8 +- .../CamemBertForSequenceClassification$.html | 8 +- .../CamemBertForSequenceClassification.html | 8 +- .../dl/CamemBertForTokenClassification$.html | 8 +- .../dl/CamemBertForTokenClassification.html | 8 +- .../classifier/dl/ClassifierDLApproach$.html | 8 +- .../classifier/dl/ClassifierDLApproach.html | 8 +- .../classifier/dl/ClassifierDLModel$.html | 8 +- .../classifier/dl/ClassifierDLModel.html | 8 +- .../classifier/dl/ClassifierEncoder.html | 8 +- .../classifier/dl/ClassifierMetrics.html | 8 +- .../dl/DeBertaForQuestionAnswering$.html | 62 +- .../dl/DeBertaForQuestionAnswering.html | 54 +- .../dl/DeBertaForSequenceClassification$.html | 62 +- .../dl/DeBertaForSequenceClassification.html | 54 +- .../dl/DeBertaForTokenClassification$.html | 62 +- .../dl/DeBertaForTokenClassification.html | 54 +- .../dl/DistilBertForQuestionAnswering$.html | 8 +- .../dl/DistilBertForQuestionAnswering.html | 8 +- .../DistilBertForSequenceClassification$.html | 8 +- .../DistilBertForSequenceClassification.html | 8 +- .../dl/DistilBertForTokenClassification$.html | 8 +- .../dl/DistilBertForTokenClassification.html | 8 +- .../DistilBertForZeroShotClassification$.html | 8 +- .../DistilBertForZeroShotClassification.html | 8 +- .../dl/LongformerForQuestionAnswering$.html | 8 +- .../dl/LongformerForQuestionAnswering.html | 8 +- .../LongformerForSequenceClassification$.html | 8 +- .../LongformerForSequenceClassification.html | 8 +- .../dl/LongformerForTokenClassification$.html | 8 +- .../dl/LongformerForTokenClassification.html | 8 +- .../dl/MultiClassifierDLApproach.html | 8 +- .../dl/MultiClassifierDLModel$.html | 8 +- .../classifier/dl/MultiClassifierDLModel.html | 8 +- ...ReadAlbertForQuestionAnsweringDLModel.html | 8 +- .../dl/ReadAlbertForSequenceDLModel.html | 8 +- 
.../dl/ReadAlbertForTokenDLModel.html | 8 +- .../dl/ReadBartForZeroShotDLModel.html | 8 +- .../ReadBertForQuestionAnsweringDLModel.html | 8 +- .../dl/ReadBertForSequenceDLModel.html | 8 +- .../dl/ReadBertForTokenDLModel.html | 8 +- .../dl/ReadBertForZeroShotDLModel.html | 8 +- .../dl/ReadCamemBertForQADLModel.html | 8 +- .../dl/ReadCamemBertForSequenceDLModel.html | 8 +- .../dl/ReadCamemBertForTokenDLModel.html | 8 +- .../dl/ReadClassifierDLTensorflowModel.html | 8 +- ...eadDeBertaForQuestionAnsweringDLModel.html | 64 +- .../dl/ReadDeBertaForSequenceDLModel.html | 64 +- .../dl/ReadDeBertaForTokenDLModel.html | 64 +- ...DistilBertForQuestionAnsweringDLModel.html | 8 +- .../dl/ReadDistilBertForSequenceDLModel.html | 8 +- .../dl/ReadDistilBertForTokenDLModel.html | 8 +- .../dl/ReadDistilBertForZeroShotDLModel.html | 8 +- ...LongformerForQuestionAnsweringDLModel.html | 8 +- .../dl/ReadLongformerForSequenceDLModel.html | 8 +- .../dl/ReadLongformerForTokenDLModel.html | 8 +- .../ReadMultiClassifierDLTensorflowModel.html | 8 +- ...eadRoBertaForQuestionAnsweringDLModel.html | 8 +- .../dl/ReadRoBertaForSequenceDLModel.html | 8 +- .../dl/ReadRoBertaForTokenDLModel.html | 8 +- .../dl/ReadRoBertaForZeroShotDLModel.html | 8 +- .../dl/ReadSentimentDLTensorflowModel.html | 8 +- .../ReadTapasForQuestionAnsweringDLModel.html | 8 +- ...XlmRoBertaForQuestionAnsweringDLModel.html | 8 +- .../dl/ReadXlmRoBertaForSequenceDLModel.html | 8 +- .../dl/ReadXlmRoBertaForTokenDLModel.html | 8 +- .../dl/ReadXlmRoBertaForZeroShotDLModel.html | 8 +- .../dl/ReadXlnetForSequenceDLModel.html | 8 +- .../dl/ReadXlnetForTokenDLModel.html | 8 +- .../ReadablePretrainedAlbertForQAModel.html | 8 +- ...dablePretrainedAlbertForSequenceModel.html | 8 +- ...ReadablePretrainedAlbertForTokenModel.html | 8 +- ...eadablePretrainedBartForZeroShotModel.html | 8 +- .../dl/ReadablePretrainedBertForQAModel.html | 8 +- ...eadablePretrainedBertForSequenceModel.html | 8 +- .../ReadablePretrainedBertForTokenModel.html | 8 
+- ...eadablePretrainedBertForZeroShotModel.html | 8 +- ...ReadablePretrainedCamemBertForQAModel.html | 8 +- ...lePretrainedCamemBertForSequenceModel.html | 8 +- ...dablePretrainedCamemBertForTokenModel.html | 8 +- .../dl/ReadablePretrainedClassifierDL.html | 8 +- .../ReadablePretrainedDeBertaForQAModel.html | 8 +- ...ablePretrainedDeBertaForSequenceModel.html | 8 +- ...eadablePretrainedDeBertaForTokenModel.html | 8 +- ...eadablePretrainedDistilBertForQAModel.html | 8 +- ...ePretrainedDistilBertForSequenceModel.html | 8 +- ...ablePretrainedDistilBertForTokenModel.html | 8 +- ...ePretrainedDistilBertForZeroShotModel.html | 8 +- ...eadablePretrainedLongformerForQAModel.html | 8 +- ...ePretrainedLongformerForSequenceModel.html | 8 +- ...ablePretrainedLongformerForTokenModel.html | 8 +- .../ReadablePretrainedMultiClassifierDL.html | 8 +- .../ReadablePretrainedRoBertaForQAModel.html | 8 +- ...ablePretrainedRoBertaForSequenceModel.html | 8 +- ...eadablePretrainedRoBertaForTokenModel.html | 8 +- ...ablePretrainedRoBertaForZeroShotModel.html | 8 +- .../dl/ReadablePretrainedSentimentDL.html | 8 +- .../dl/ReadablePretrainedTapasForQAModel.html | 8 +- ...eadablePretrainedXlmRoBertaForQAModel.html | 8 +- ...ePretrainedXlmRoBertaForSequenceModel.html | 8 +- ...ablePretrainedXlmRoBertaForTokenModel.html | 8 +- ...ePretrainedXlmRoBertaForZeroShotModel.html | 8 +- ...adablePretrainedXlnetForSequenceModel.html | 8 +- .../ReadablePretrainedXlnetForTokenModel.html | 8 +- .../dl/RoBertaForQuestionAnswering$.html | 8 +- .../dl/RoBertaForQuestionAnswering.html | 8 +- .../dl/RoBertaForSequenceClassification$.html | 8 +- .../dl/RoBertaForSequenceClassification.html | 8 +- .../dl/RoBertaForTokenClassification$.html | 8 +- .../dl/RoBertaForTokenClassification.html | 8 +- .../dl/RoBertaForZeroShotClassification$.html | 8 +- .../dl/RoBertaForZeroShotClassification.html | 8 +- .../classifier/dl/SentimentApproach$.html | 8 +- .../classifier/dl/SentimentDLApproach.html | 8 +- 
.../classifier/dl/SentimentDLModel$.html | 8 +- .../classifier/dl/SentimentDLModel.html | 8 +- .../dl/TapasForQuestionAnswering$.html | 8 +- .../dl/TapasForQuestionAnswering.html | 8 +- .../dl/XlmRoBertaForQuestionAnswering$.html | 8 +- .../dl/XlmRoBertaForQuestionAnswering.html | 8 +- .../XlmRoBertaForSequenceClassification$.html | 8 +- .../XlmRoBertaForSequenceClassification.html | 8 +- .../dl/XlmRoBertaForTokenClassification$.html | 8 +- .../dl/XlmRoBertaForTokenClassification.html | 8 +- .../XlmRoBertaForZeroShotClassification$.html | 8 +- .../XlmRoBertaForZeroShotClassification.html | 8 +- .../dl/XlnetForSequenceClassification$.html | 8 +- .../dl/XlnetForSequenceClassification.html | 8 +- .../dl/XlnetForTokenClassification$.html | 8 +- .../dl/XlnetForTokenClassification.html | 8 +- .../nlp/annotators/classifier/dl/index.html | 32 +- .../nlp/annotators/classifier/index.html | 8 +- .../nlp/annotators/common/Annotated$.html | 8 +- .../nlp/annotators/common/Annotated.html | 8 +- .../nlp/annotators/common/ChunkSplit$.html | 8 +- .../nlp/annotators/common/ConllSentence.html | 8 +- .../DatasetHelpers$$DataFrameHelper.html | 8 +- .../annotators/common/DatasetHelpers$.html | 8 +- .../annotators/common/DependencyParsed$.html | 8 +- .../common/DependencyParsedSentence.html | 8 +- .../common/EmbeddingsWithSentence$.html | 8 +- .../annotators/common/IndexedTaggedWord.html | 8 +- .../nlp/annotators/common/IndexedToken.html | 8 +- .../nlp/annotators/common/InfixToken$.html | 8 +- .../nlp/annotators/common/InfixToken.html | 8 +- .../LabeledDependency$$DependencyInfo.html | 8 +- .../annotators/common/LabeledDependency$.html | 8 +- .../nlp/annotators/common/NerTagged$.html | 8 +- .../nlp/annotators/common/PosTagged$.html | 8 +- .../nlp/annotators/common/PrefixedToken$.html | 8 +- .../nlp/annotators/common/PrefixedToken.html | 8 +- .../common/PreprocessingParser.html | 8 +- .../nlp/annotators/common/Sentence$.html | 8 +- .../nlp/annotators/common/Sentence.html | 8 +- 
.../nlp/annotators/common/SentenceSplit$.html | 8 +- .../nlp/annotators/common/SuffixedToken$.html | 8 +- .../nlp/annotators/common/SuffixedToken.html | 8 +- .../nlp/annotators/common/TableData$.html | 8 +- .../nlp/annotators/common/TableData.html | 8 +- .../nlp/annotators/common/Tagged.html | 8 +- .../annotators/common/TaggedSentence$.html | 8 +- .../nlp/annotators/common/TaggedSentence.html | 8 +- .../nlp/annotators/common/TaggedWord.html | 8 +- .../nlp/annotators/common/TokenPiece.html | 8 +- .../common/TokenPieceEmbeddings$.html | 8 +- .../common/TokenPieceEmbeddings.html | 8 +- .../annotators/common/TokenizedSentence.html | 8 +- .../common/TokenizedWithSentence$.html | 8 +- .../annotators/common/WordWithDependency.html | 8 +- .../common/WordpieceEmbeddingsSentence$.html | 8 +- .../common/WordpieceEmbeddingsSentence.html | 8 +- .../common/WordpieceTokenized$.html | 8 +- .../common/WordpieceTokenizedSentence.html | 8 +- .../nlp/annotators/common/index.html | 8 +- .../ReadSpanBertCorefTensorflowModel.html | 8 +- .../ReadablePretrainedSpanBertCorefModel.html | 8 +- .../annotators/coref/SpanBertCorefModel$.html | 8 +- .../annotators/coref/SpanBertCorefModel.html | 8 +- .../nlp/annotators/coref/index.html | 8 +- .../cv/CLIPForZeroShotClassification$.html | 8 +- .../cv/CLIPForZeroShotClassification.html | 8 +- .../cv/ConvNextForImageClassification$.html | 8 +- .../cv/ConvNextForImageClassification.html | 8 +- .../nlp/annotators/cv/HasRescaleFactor.html | 8 +- ...eadCLIPForZeroShotClassificationModel.html | 8 +- .../cv/ReadConvNextForImageDLModel.html | 8 +- .../cv/ReadSwinForImageDLModel.html | 8 +- .../annotators/cv/ReadViTForImageDLModel.html | 8 +- .../cv/ReadVisionEncoderDecoderDLModel.html | 8 +- ...nedCLIPForZeroShotClassificationModel.html | 8 +- ...adablePretrainedConvNextForImageModel.html | 8 +- .../ReadablePretrainedSwinForImageModel.html | 8 +- .../ReadablePretrainedViTForImageModel.html | 8 +- ...lePretrainedVisionEncoderDecoderModel.html | 8 +- 
.../cv/SwinForImageClassification$.html | 8 +- .../cv/SwinForImageClassification.html | 8 +- .../cv/ViTForImageClassification$.html | 8 +- .../cv/ViTForImageClassification.html | 8 +- ...sionEncoderDecoderForImageCaptioning$.html | 8 +- ...isionEncoderDecoderForImageCaptioning.html | 8 +- .../johnsnowlabs/nlp/annotators/cv/index.html | 8 +- .../er/AhoCorasickAutomaton$Node.html | 8 +- .../annotators/er/AhoCorasickAutomaton.html | 8 +- .../nlp/annotators/er/EntityPattern.html | 8 +- .../annotators/er/EntityRulerApproach.html | 8 +- .../annotators/er/EntityRulerFeatures.html | 8 +- .../nlp/annotators/er/EntityRulerModel$.html | 8 +- .../nlp/annotators/er/EntityRulerModel.html | 8 +- .../nlp/annotators/er/EntityRulerUtil$.html | 8 +- .../annotators/er/FlattenEntityPattern.html | 8 +- .../nlp/annotators/er/PatternsReadWriter.html | 8 +- .../nlp/annotators/er/PatternsReader.html | 8 +- .../er/ReadablePretrainedEntityRuler.html | 8 +- .../er/RegexPatternsReadWriter.html | 8 +- .../annotators/er/RegexPatternsReader.html | 8 +- .../johnsnowlabs/nlp/annotators/er/index.html | 8 +- .../johnsnowlabs/nlp/annotators/index.html | 48 +- .../nlp/annotators/keyword/index.html | 8 +- .../keyword/yake/YakeKeywordExtraction$.html | 8 +- .../keyword/yake/YakeKeywordExtraction.html | 8 +- .../annotators/keyword/yake/YakeParams.html | 8 +- .../nlp/annotators/keyword/yake/index.html | 8 +- .../annotators/keyword/yake/util/Token.html | 8 +- .../keyword/yake/util/Utilities$.html | 8 +- .../annotators/keyword/yake/util/index.html | 8 +- .../annotators/ld/dl/LanguageDetectorDL$.html | 8 +- .../annotators/ld/dl/LanguageDetectorDL.html | 8 +- ...ReadLanguageDetectorDLTensorflowModel.html | 8 +- ...ablePretrainedLanguageDetectorDLModel.html | 8 +- .../nlp/annotators/ld/dl/index.html | 8 +- .../johnsnowlabs/nlp/annotators/ld/index.html | 8 +- .../nlp/annotators/ner/ModelMetrics$.html | 8 +- .../nlp/annotators/ner/NamedEntity.html | 8 +- .../nlp/annotators/ner/NerApproach.html | 8 +- 
.../nlp/annotators/ner/NerConverter$.html | 8 +- .../nlp/annotators/ner/NerConverter.html | 8 +- .../nlp/annotators/ner/NerOverwriter$.html | 8 +- .../nlp/annotators/ner/NerOverwriter.html | 8 +- .../nlp/annotators/ner/NerTagsEncoding$.html | 8 +- .../nlp/annotators/ner/Verbose$.html | 8 +- .../ner/crf/DictionaryFeatures$.html | 8 +- .../ner/crf/DictionaryFeatures.html | 8 +- .../ner/crf/FeatureGenerator$TokenType$.html | 8 +- .../annotators/ner/crf/FeatureGenerator.html | 8 +- .../annotators/ner/crf/NerCrfApproach$.html | 8 +- .../annotators/ner/crf/NerCrfApproach.html | 8 +- .../nlp/annotators/ner/crf/NerCrfModel$.html | 8 +- .../nlp/annotators/ner/crf/NerCrfModel.html | 8 +- .../ner/crf/ReadablePretrainedNerCrf.html | 8 +- .../nlp/annotators/ner/crf/index.html | 8 +- .../nlp/annotators/ner/dl/LoadsContrib$.html | 8 +- .../nlp/annotators/ner/dl/NerDLApproach$.html | 8 +- .../nlp/annotators/ner/dl/NerDLApproach.html | 8 +- .../nlp/annotators/ner/dl/NerDLModel$.html | 8 +- .../nlp/annotators/ner/dl/NerDLModel.html | 8 +- .../ner/dl/NerDLModelPythonReader$.html | 8 +- .../ner/dl/ReadZeroShotNerDLModel.html | 8 +- .../ner/dl/ReadablePretrainedNerDL.html | 8 +- .../ner/dl/ReadablePretrainedZeroShotNer.html | 8 +- .../nlp/annotators/ner/dl/ReadsNERGraph.html | 8 +- .../annotators/ner/dl/WithGraphResolver.html | 8 +- .../annotators/ner/dl/ZeroShotNerModel$.html | 8 +- .../annotators/ner/dl/ZeroShotNerModel.html | 8 +- .../nlp/annotators/ner/dl/index.html | 8 +- .../nlp/annotators/ner/index.html | 8 +- ...lizableFormat$$SerializableDateFormat.html | 8 +- .../AnnotatorParam$SerializableFormat$.html | 8 +- .../nlp/annotators/param/AnnotatorParam.html | 8 +- .../annotators/param/EvaluationDLParams.html | 8 +- .../param/ExternalResourceParam.html | 8 +- .../param/SerializedAnnotatorComponent.html | 8 +- .../param/WritableAnnotatorComponent.html | 8 +- .../nlp/annotators/param/index.html | 8 +- .../parser/dep/DependencyParserApproach$.html | 8 +- 
.../parser/dep/DependencyParserApproach.html | 8 +- .../parser/dep/DependencyParserModel$.html | 8 +- .../parser/dep/DependencyParserModel.html | 8 +- .../GreedyTransition/DependencyMaker$.html | 8 +- .../DependencyMaker$CurrentState.html | 8 +- .../DependencyMaker$ParseState.html | 8 +- .../dep/GreedyTransition/DependencyMaker.html | 8 +- .../GreedyTransitionApproach$.html | 8 +- .../parser/dep/GreedyTransition/index.html | 8 +- .../GreedyTransition/package$$Feature.html | 8 +- .../GreedyTransition/package$$WordData.html | 8 +- .../parser/dep/Perceptron$WeightLearner.html | 8 +- .../nlp/annotators/parser/dep/Perceptron.html | 8 +- .../dep/ReadablePretrainedDependency.html | 8 +- .../annotators/parser/dep/TagDictionary$.html | 8 +- .../nlp/annotators/parser/dep/Tagger$.html | 8 +- .../nlp/annotators/parser/dep/Tagger.html | 8 +- .../nlp/annotators/parser/dep/index.html | 8 +- .../nlp/annotators/parser/index.html | 8 +- .../annotators/parser/typdep/ConllData.html | 8 +- .../parser/typdep/DependencyArcList.html | 8 +- .../parser/typdep/DependencyInstance.html | 8 +- .../parser/typdep/DependencyPipe.html | 8 +- .../parser/typdep/LocalFeatureData.html | 8 +- .../parser/typdep/LowRankTensor.html | 8 +- .../nlp/annotators/parser/typdep/Options.html | 8 +- .../annotators/parser/typdep/Parameters.html | 8 +- .../parser/typdep/PredictionParameters.html | 8 +- .../ReadablePretrainedTypedDependency.html | 8 +- .../parser/typdep/TrainDependencies.html | 8 +- .../annotators/parser/typdep/TrainFile.html | 8 +- .../parser/typdep/TypedDependencyParser.html | 8 +- .../TypedDependencyParserApproach$.html | 8 +- .../typdep/TypedDependencyParserApproach.html | 8 +- .../typdep/TypedDependencyParserModel$.html | 8 +- .../typdep/TypedDependencyParserModel.html | 8 +- .../typdep/feature/FeatureTemplate.html | 8 +- .../feature/SyntacticFeatureFactory.html | 8 +- .../parser/typdep/feature/index.html | 8 +- .../nlp/annotators/parser/typdep/index.html | 8 +- 
.../parser/typdep/io/Conll09Reader.html | 8 +- .../parser/typdep/io/ConllUReader.html | 8 +- .../parser/typdep/io/ConllWriter.html | 8 +- .../parser/typdep/io/DependencyReader.html | 8 +- .../annotators/parser/typdep/io/index.html | 8 +- .../parser/typdep/util/Alphabet.html | 8 +- .../parser/typdep/util/Collector.html | 8 +- .../parser/typdep/util/DependencyLabel.html | 8 +- .../parser/typdep/util/Dictionary.html | 8 +- .../parser/typdep/util/DictionarySet.html | 8 +- .../parser/typdep/util/FeatureVector.html | 8 +- .../parser/typdep/util/ScoreCollector.html | 8 +- .../annotators/parser/typdep/util/Utils.html | 8 +- .../annotators/parser/typdep/util/index.html | 8 +- .../nlp/annotators/pos/index.html | 8 +- .../pos/perceptron/AveragedPerceptron.html | 8 +- .../pos/perceptron/PerceptronApproach$.html | 8 +- .../pos/perceptron/PerceptronApproach.html | 8 +- .../PerceptronApproachDistributed$.html | 8 +- .../PerceptronApproachDistributed.html | 8 +- .../pos/perceptron/PerceptronModel$.html | 8 +- .../pos/perceptron/PerceptronModel.html | 8 +- .../perceptron/PerceptronPredictionUtils.html | 8 +- .../perceptron/PerceptronTrainingUtils.html | 8 +- .../pos/perceptron/PerceptronUtils.html | 8 +- .../ReadablePretrainedPerceptron.html | 8 +- .../StringMapStringDoubleAccumulator.html | 8 +- .../perceptron/TrainingPerceptronLegacy.html | 8 +- .../TupleKeyLongDoubleMapAccumulator.html | 8 +- .../nlp/annotators/pos/perceptron/index.html | 8 +- .../sbd/SentenceDetectorParams.html | 8 +- .../nlp/annotators/sbd/index.html | 8 +- .../sbd/pragmatic/CustomPragmaticMethod.html | 8 +- .../sbd/pragmatic/DefaultPragmaticMethod.html | 8 +- .../sbd/pragmatic/MixedPragmaticMethod.html | 8 +- .../pragmatic/PragmaticContentFormatter$.html | 8 +- .../pragmatic/PragmaticContentFormatter.html | 8 +- .../sbd/pragmatic/PragmaticDictionaries$.html | 8 +- .../sbd/pragmatic/PragmaticMethod.html | 8 +- .../pragmatic/PragmaticSentenceExtractor.html | 8 +- .../sbd/pragmatic/PragmaticSymbols$.html | 8 +- 
.../annotators/sbd/pragmatic/RuleSymbols.html | 8 +- .../sbd/pragmatic/SentenceDetector$.html | 8 +- .../sbd/pragmatic/SentenceDetector.html | 8 +- .../nlp/annotators/sbd/pragmatic/index.html | 8 +- .../nlp/annotators/sda/index.html | 8 +- .../sda/pragmatic/PragmaticScorer.html | 8 +- .../sda/pragmatic/SentimentDetector$.html | 8 +- .../sda/pragmatic/SentimentDetector.html | 8 +- .../pragmatic/SentimentDetectorModel$.html | 8 +- .../sda/pragmatic/SentimentDetectorModel.html | 8 +- .../nlp/annotators/sda/pragmatic/index.html | 8 +- .../sda/vivekn/ReadablePretrainedVivekn.html | 8 +- .../sda/vivekn/ViveknSentimentApproach.html | 8 +- .../sda/vivekn/ViveknSentimentModel$.html | 8 +- .../sda/vivekn/ViveknSentimentModel.html | 8 +- .../sda/vivekn/ViveknSentimentUtils.html | 8 +- .../nlp/annotators/sda/vivekn/index.html | 8 +- .../sentence_detector_dl/Metrics.html | 8 +- .../ReadablePretrainedSentenceDetectorDL.html | 8 +- .../ReadsSentenceDetectorDLGraph.html | 8 +- .../SentenceDetectorDLApproach.html | 8 +- .../SentenceDetectorDLEncoder$.html | 8 +- .../SentenceDetectorDLEncoder.html | 8 +- .../SentenceDetectorDLEncoderParam.html | 8 +- .../SentenceDetectorDLModel$.html | 8 +- .../SentenceDetectorDLModel.html | 8 +- .../sentence_detector_dl/index.html | 8 +- .../annotators/seq2seq/BartTransformer$.html | 8 +- .../annotators/seq2seq/BartTransformer.html | 8 +- .../annotators/seq2seq/GPT2Transformer$.html | 8 +- .../annotators/seq2seq/GPT2Transformer.html | 8 +- .../seq2seq/MarianTransformer$.html | 8 +- .../annotators/seq2seq/MarianTransformer.html | 8 +- .../seq2seq/ReadBartTransformerDLModel.html | 8 +- .../seq2seq/ReadGPT2TransformerDLModel.html | 8 +- .../seq2seq/ReadMarianMTDLModel.html | 8 +- .../seq2seq/ReadT5TransformerDLModel.html | 8 +- ...eadablePretrainedBartTransformerModel.html | 8 +- ...eadablePretrainedGPT2TransformerModel.html | 8 +- .../ReadablePretrainedMarianMTModel.html | 8 +- .../ReadablePretrainedT5TransformerModel.html | 8 +- 
.../annotators/seq2seq/T5Transformer$.html | 8 +- .../nlp/annotators/seq2seq/T5Transformer.html | 8 +- .../nlp/annotators/seq2seq/index.html | 8 +- .../DocumentSimilarityRankerApproach$.html | 8 +- .../DocumentSimilarityRankerApproach.html | 8 +- .../DocumentSimilarityRankerModel$.html | 8 +- .../DocumentSimilarityRankerModel.html | 8 +- .../similarity/IndexedNeighbors.html | 8 +- .../IndexedNeighborsWithDistance.html | 8 +- .../similarity/NeighborAnnotation.html | 8 +- .../similarity/NeighborsResultSet.html | 8 +- .../ReadableDocumentSimilarityRanker.html | 8 +- .../nlp/annotators/similarity/index.html | 8 +- .../spell/context/CandidateStrategy$.html | 8 +- ...ntextSpellCheckerApproach$ArrayHelper.html | 8 +- .../context/ContextSpellCheckerApproach.html | 8 +- .../context/ContextSpellCheckerModel$.html | 8 +- .../ContextSpellCheckerModel$StringTools.html | 8 +- .../context/ContextSpellCheckerModel.html | 8 +- .../spell/context/HasTransducerFeatures.html | 8 +- .../spell/context/LangModelSentence.html | 8 +- .../ReadablePretrainedContextSpell.html | 8 +- .../context/ReadsLanguageModelGraph.html | 8 +- .../spell/context/WeightedLevenshtein.html | 8 +- .../nlp/annotators/spell/context/index.html | 8 +- .../spell/context/parser/AgeToken.html | 8 +- .../spell/context/parser/DateToken.html | 8 +- .../context/parser/GenericRegexParser.html | 8 +- .../context/parser/GenericVocabParser.html | 8 +- .../spell/context/parser/LocationClass.html | 8 +- .../spell/context/parser/MainVocab.html | 8 +- .../spell/context/parser/MedicationClass.html | 8 +- .../spell/context/parser/NamesClass.html | 8 +- .../spell/context/parser/NumberToken.html | 8 +- .../spell/context/parser/RegexParser.html | 8 +- .../context/parser/SerializableClass.html | 8 +- .../context/parser/SpecialClassParser.html | 8 +- .../context/parser/TransducerSeqFeature.html | 8 +- .../spell/context/parser/UnitToken.html | 8 +- .../spell/context/parser/VocabParser.html | 8 +- .../spell/context/parser/index.html | 8 +- 
.../nlp/annotators/spell/index.html | 8 +- .../spell/norvig/NorvigSweetingApproach$.html | 8 +- .../spell/norvig/NorvigSweetingApproach.html | 8 +- .../spell/norvig/NorvigSweetingModel$.html | 8 +- .../spell/norvig/NorvigSweetingModel.html | 8 +- .../spell/norvig/NorvigSweetingParams.html | 8 +- .../norvig/ReadablePretrainedNorvig.html | 8 +- .../nlp/annotators/spell/norvig/index.html | 8 +- .../ReadablePretrainedSymmetric.html | 8 +- .../symmetric/SymmetricDeleteApproach$.html | 8 +- .../symmetric/SymmetricDeleteApproach.html | 8 +- .../symmetric/SymmetricDeleteModel$.html | 8 +- .../SymmetricDeleteModel$SuggestedWord.html | 8 +- .../spell/symmetric/SymmetricDeleteModel.html | 8 +- .../symmetric/SymmetricDeleteParams.html | 8 +- .../nlp/annotators/spell/symmetric/index.html | 8 +- .../nlp/annotators/spell/util/Utilities$.html | 8 +- .../nlp/annotators/spell/util/index.html | 8 +- .../nlp/annotators/tapas/TapasCellDate$.html | 8 +- .../nlp/annotators/tapas/TapasCellDate.html | 8 +- .../nlp/annotators/tapas/TapasCellValue$.html | 8 +- .../nlp/annotators/tapas/TapasCellValue.html | 8 +- .../nlp/annotators/tapas/TapasEncoder.html | 8 +- .../nlp/annotators/tapas/TapasInputData.html | 8 +- .../tapas/TapasNumericRelation$.html | 8 +- .../tapas/TapasNumericValueSpan$.html | 8 +- .../tapas/TapasNumericValueSpan.html | 8 +- .../nlp/annotators/tapas/index.html | 8 +- .../tokenizer/bpe/BartTokenizer.html | 8 +- .../tokenizer/bpe/BpeTokenizer$.html | 8 +- .../tokenizer/bpe/CLIPTokenizer.html | 8 +- .../tokenizer/bpe/Gpt2Tokenizer.html | 8 +- .../tokenizer/bpe/RobertaTokenizer.html | 8 +- .../tokenizer/bpe/SpecialToken.html | 8 +- .../tokenizer/bpe/WhisperTokenDecoder.html | 8 +- .../nlp/annotators/tokenizer/bpe/index.html | 8 +- .../nlp/annotators/tokenizer/index.html | 8 +- .../ws/ReadablePretrainedWordSegmenter.html | 8 +- .../nlp/annotators/ws/TagsType$.html | 8 +- .../annotators/ws/WordSegmenterApproach$.html | 8 +- .../annotators/ws/WordSegmenterApproach.html | 8 +- 
.../annotators/ws/WordSegmenterModel$.html | 8 +- .../nlp/annotators/ws/WordSegmenterModel.html | 8 +- .../johnsnowlabs/nlp/annotators/ws/index.html | 8 +- .../nlp/embeddings/AlbertEmbeddings$.html | 20 +- .../nlp/embeddings/AlbertEmbeddings.html | 20 +- .../nlp/embeddings/BGEEmbeddings$.html | 1248 +++++++ .../nlp/embeddings/BGEEmbeddings.html | 3288 +++++++++++++++++ .../nlp/embeddings/BertEmbeddings$.html | 20 +- .../nlp/embeddings/BertEmbeddings.html | 20 +- .../embeddings/BertSentenceEmbeddings$.html | 20 +- .../embeddings/BertSentenceEmbeddings.html | 20 +- .../nlp/embeddings/CamemBertEmbeddings$.html | 20 +- .../nlp/embeddings/CamemBertEmbeddings.html | 20 +- .../nlp/embeddings/ChunkEmbeddings$.html | 20 +- .../nlp/embeddings/ChunkEmbeddings.html | 20 +- .../nlp/embeddings/DeBertaEmbeddings$.html | 20 +- .../nlp/embeddings/DeBertaEmbeddings.html | 20 +- .../nlp/embeddings/DistilBertEmbeddings$.html | 20 +- .../nlp/embeddings/DistilBertEmbeddings.html | 20 +- .../nlp/embeddings/Doc2VecApproach$.html | 20 +- .../nlp/embeddings/Doc2VecApproach.html | 20 +- .../nlp/embeddings/Doc2VecModel$.html | 20 +- .../nlp/embeddings/Doc2VecModel.html | 20 +- .../nlp/embeddings/E5Embeddings$.html | 20 +- .../nlp/embeddings/E5Embeddings.html | 20 +- .../nlp/embeddings/ElmoEmbeddings$.html | 20 +- .../nlp/embeddings/ElmoEmbeddings.html | 20 +- .../EmbeddingsCoverage$CoverageResult.html | 8 +- .../nlp/embeddings/EmbeddingsCoverage.html | 20 +- .../embeddings/HasEmbeddingsProperties.html | 22 +- .../nlp/embeddings/InstructorEmbeddings$.html | 20 +- .../nlp/embeddings/InstructorEmbeddings.html | 20 +- .../nlp/embeddings/LongformerEmbeddings$.html | 20 +- .../nlp/embeddings/LongformerEmbeddings.html | 20 +- .../nlp/embeddings/MPNetEmbeddings$.html | 20 +- .../nlp/embeddings/MPNetEmbeddings.html | 20 +- .../PoolingStrategy$$AnnotatorType$.html | 8 +- .../nlp/embeddings/PoolingStrategy$.html | 20 +- .../nlp/embeddings/ReadAlbertDLModel.html | 20 +- 
.../nlp/embeddings/ReadBGEDLModel.html | 1067 ++++++ .../nlp/embeddings/ReadBertDLModel.html | 20 +- .../embeddings/ReadBertSentenceDLModel.html | 20 +- .../nlp/embeddings/ReadCamemBertDLModel.html | 20 +- .../nlp/embeddings/ReadDeBertaDLModel.html | 20 +- .../nlp/embeddings/ReadDistilBertDLModel.html | 20 +- .../nlp/embeddings/ReadE5DLModel.html | 20 +- .../nlp/embeddings/ReadElmoDLModel.html | 20 +- .../nlp/embeddings/ReadInstructorDLModel.html | 20 +- .../nlp/embeddings/ReadLongformerDLModel.html | 20 +- .../nlp/embeddings/ReadMPNetDLModel.html | 20 +- .../nlp/embeddings/ReadRobertaDLModel.html | 20 +- .../ReadRobertaSentenceDLModel.html | 20 +- .../nlp/embeddings/ReadUSEDLModel.html | 20 +- .../nlp/embeddings/ReadXlmRobertaDLModel.html | 20 +- .../ReadXlmRobertaSentenceDLModel.html | 20 +- .../nlp/embeddings/ReadXlnetDLModel.html | 20 +- .../ReadablePretrainedAlbertModel.html | 20 +- .../ReadablePretrainedBGEModel.html | 1077 ++++++ .../ReadablePretrainedBertModel.html | 20 +- .../ReadablePretrainedBertSentenceModel.html | 20 +- .../ReadablePretrainedCamemBertModel.html | 20 +- .../ReadablePretrainedDeBertaModel.html | 20 +- .../ReadablePretrainedDistilBertModel.html | 20 +- .../embeddings/ReadablePretrainedDoc2Vec.html | 20 +- .../embeddings/ReadablePretrainedE5Model.html | 20 +- .../ReadablePretrainedElmoModel.html | 20 +- .../ReadablePretrainedInstructorModel.html | 20 +- .../ReadablePretrainedLongformerModel.html | 20 +- .../ReadablePretrainedMPNetModel.html | 20 +- .../ReadablePretrainedRobertaModel.html | 20 +- ...eadablePretrainedRobertaSentenceModel.html | 20 +- .../ReadablePretrainedUSEModel.html | 20 +- .../ReadablePretrainedWord2Vec.html | 20 +- .../ReadablePretrainedWordEmbeddings.html | 20 +- .../ReadablePretrainedXlmRobertaModel.html | 20 +- ...ablePretrainedXlmRobertaSentenceModel.html | 20 +- .../ReadablePretrainedXlnetModel.html | 20 +- .../nlp/embeddings/ReadsFromBytes.html | 20 +- .../nlp/embeddings/RoBertaEmbeddings$.html | 20 +- 
.../nlp/embeddings/RoBertaEmbeddings.html | 20 +- .../RoBertaSentenceEmbeddings$.html | 20 +- .../embeddings/RoBertaSentenceEmbeddings.html | 20 +- .../nlp/embeddings/SentenceEmbeddings$.html | 20 +- .../nlp/embeddings/SentenceEmbeddings.html | 20 +- .../embeddings/UniversalSentenceEncoder$.html | 20 +- .../embeddings/UniversalSentenceEncoder.html | 20 +- .../nlp/embeddings/Word2VecApproach$.html | 20 +- .../nlp/embeddings/Word2VecApproach.html | 20 +- .../nlp/embeddings/Word2VecModel$.html | 20 +- .../nlp/embeddings/Word2VecModel.html | 20 +- .../nlp/embeddings/WordEmbeddings$.html | 20 +- .../nlp/embeddings/WordEmbeddings.html | 20 +- .../WordEmbeddingsBinaryIndexer$.html | 20 +- .../nlp/embeddings/WordEmbeddingsModel$.html | 20 +- .../nlp/embeddings/WordEmbeddingsModel.html | 20 +- .../nlp/embeddings/WordEmbeddingsReader.html | 20 +- .../WordEmbeddingsTextIndexer$.html | 20 +- .../nlp/embeddings/WordEmbeddingsWriter.html | 20 +- .../nlp/embeddings/XlmRoBertaEmbeddings$.html | 20 +- .../nlp/embeddings/XlmRoBertaEmbeddings.html | 20 +- .../XlmRoBertaSentenceEmbeddings$.html | 20 +- .../XlmRoBertaSentenceEmbeddings.html | 20 +- .../nlp/embeddings/XlnetEmbeddings$.html | 20 +- .../nlp/embeddings/XlnetEmbeddings.html | 20 +- .../johnsnowlabs/nlp/embeddings/index.html | 144 +- .../DocumentSimilarityRankerFinisher$.html | 8 +- .../DocumentSimilarityRankerFinisher.html | 8 +- .../com/johnsnowlabs/nlp/finisher/index.html | 8 +- .../nlp/functions$$EachAnnotations.html | 8 +- .../nlp/functions$$ExplodeAnnotations.html | 8 +- .../nlp/functions$$FilterAnnotations.html | 8 +- .../nlp/functions$$MapAnnotations.html | 8 +- docs/api/com/johnsnowlabs/nlp/functions$.html | 8 +- docs/api/com/johnsnowlabs/nlp/index.html | 8 +- .../nlp/pretrained/PretrainedPipeline$.html | 8 +- .../nlp/pretrained/PretrainedPipeline.html | 8 +- .../pretrained/PythonResourceDownloader$.html | 8 +- .../nlp/pretrained/RepositoryMetadata.html | 8 +- .../nlp/pretrained/ResourceDownloader$.html | 8 +- 
.../nlp/pretrained/ResourceDownloader.html | 8 +- .../nlp/pretrained/ResourceMetadata$.html | 8 +- .../nlp/pretrained/ResourceMetadata.html | 8 +- .../nlp/pretrained/ResourceRequest.html | 8 +- .../nlp/pretrained/ResourceType$.html | 8 +- .../nlp/pretrained/S3ResourceDownloader.html | 8 +- .../johnsnowlabs/nlp/pretrained/index.html | 8 +- .../com/johnsnowlabs/nlp/recursive/index.html | 8 +- .../nlp/recursive/package$$Recursive.html | 8 +- .../recursive/package$$RecursiveModel.html | 8 +- .../nlp/serialization/ArrayFeature.html | 8 +- .../nlp/serialization/Feature.html | 8 +- .../nlp/serialization/MapFeature.html | 8 +- .../SerializedExternalResource.html | 8 +- .../nlp/serialization/SetFeature.html | 8 +- .../nlp/serialization/StructFeature.html | 8 +- .../nlp/serialization/TransducerFeature.html | 8 +- .../johnsnowlabs/nlp/serialization/index.html | 8 +- .../com/johnsnowlabs/nlp/training/CoNLL.html | 8 +- .../nlp/training/CoNLL2003NerReader.html | 8 +- .../nlp/training/CoNLLDocument.html | 8 +- .../CoNLLHelper$$CoNLLSentenceCols.html | 8 +- .../training/CoNLLHelper$$CoNLLTokenCols.html | 8 +- .../nlp/training/CoNLLHelper$.html | 8 +- .../com/johnsnowlabs/nlp/training/CoNLLU.html | 8 +- .../nlp/training/CoNLLUCols$.html | 8 +- .../nlp/training/CoNLLUDocument.html | 8 +- .../com/johnsnowlabs/nlp/training/POS.html | 8 +- .../johnsnowlabs/nlp/training/PubTator.html | 8 +- .../nlp/training/SpacyToAnnotation.html | 8 +- .../com/johnsnowlabs/nlp/training/index.html | 8 +- .../johnsnowlabs/nlp/util/FinisherUtil$.html | 12 +- .../johnsnowlabs/nlp/util/GraphBuilder.html | 12 +- .../nlp/util/LfuCache$CachedItem.html | 8 +- .../nlp/util/LfuCache$DoubleLinked.html | 8 +- .../nlp/util/LfuCache$FrequencyList.html | 8 +- .../com/johnsnowlabs/nlp/util/LfuCache.html | 12 +- .../nlp/util/LruMap$KeyPriority.html | 8 +- .../nlp/util/LruMap$KeyPriorityOrdering$.html | 8 +- .../api/com/johnsnowlabs/nlp/util/LruMap.html | 12 +- ...pConfigKeys$.html => SparkNlpConfig$.html} | 88 +- 
docs/api/com/johnsnowlabs/nlp/util/index.html | 20 +- .../nlp/util/io/CloudStorageType$.html | 8 +- .../nlp/util/io/ExternalResource$.html | 8 +- .../nlp/util/io/ExternalResource.html | 8 +- .../nlp/util/io/MatchStrategy$.html | 8 +- .../nlp/util/io/OutputHelper$.html | 8 +- .../com/johnsnowlabs/nlp/util/io/ReadAs$.html | 8 +- .../util/io/ResourceHelper$$SourceStream.html | 8 +- .../nlp/util/io/ResourceHelper$.html | 8 +- .../com/johnsnowlabs/nlp/util/io/index.html | 8 +- .../nlp/util/regex/RegexRule.html | 8 +- .../util/regex/RuleFactory$$RuleMatch.html | 8 +- .../nlp/util/regex/RuleFactory$.html | 8 +- .../nlp/util/regex/RuleFactory.html | 8 +- .../nlp/util/regex/TransformStrategy$.html | 8 +- .../johnsnowlabs/nlp/util/regex/index.html | 8 +- .../com/johnsnowlabs/storage/BytesKey.html | 8 +- .../com/johnsnowlabs/storage/Database$.html | 8 +- .../com/johnsnowlabs/storage/Database.html | 8 +- .../johnsnowlabs/storage/HasConnection.html | 8 +- .../com/johnsnowlabs/storage/HasStorage.html | 8 +- .../johnsnowlabs/storage/HasStorageModel.html | 8 +- .../storage/HasStorageOptions.html | 8 +- .../storage/HasStorageReader.html | 8 +- .../johnsnowlabs/storage/HasStorageRef$.html | 8 +- .../johnsnowlabs/storage/HasStorageRef.html | 10 +- .../storage/RocksDBConnection$.html | 8 +- .../storage/RocksDBConnection.html | 8 +- .../storage/StorageBatchWriter.html | 8 +- .../johnsnowlabs/storage/StorageFormat.html | 8 +- .../johnsnowlabs/storage/StorageHelper$.html | 8 +- .../johnsnowlabs/storage/StorageLocator$.html | 8 +- .../johnsnowlabs/storage/StorageLocator.html | 8 +- .../storage/StorageReadWriter.html | 8 +- .../johnsnowlabs/storage/StorageReadable.html | 8 +- .../johnsnowlabs/storage/StorageReader.html | 8 +- .../johnsnowlabs/storage/StorageWriter.html | 8 +- docs/api/com/johnsnowlabs/storage/index.html | 8 +- .../api/com/johnsnowlabs/util/Benchmark$.html | 8 +- docs/api/com/johnsnowlabs/util/Build$.html | 8 +- .../johnsnowlabs/util/CoNLLGenerator$.html | 8 +- 
.../com/johnsnowlabs/util/ConfigHelper$.html | 8 +- .../com/johnsnowlabs/util/ConfigLoader$.html | 8 +- .../com/johnsnowlabs/util/FileHelper$.html | 8 +- .../com/johnsnowlabs/util/JsonBuilder$.html | 8 +- .../com/johnsnowlabs/util/JsonParser$.html | 8 +- .../johnsnowlabs/util/PipelineModels$.html | 8 +- .../johnsnowlabs/util/TrainingHelper$.html | 8 +- docs/api/com/johnsnowlabs/util/Version$.html | 8 +- docs/api/com/johnsnowlabs/util/Version.html | 8 +- .../johnsnowlabs/util/ZipArchiveUtil$.html | 8 +- docs/api/com/johnsnowlabs/util/index.html | 8 +- .../util/spark/LongMapAccumulator.html | 8 +- .../util/spark/MapAccumulator.html | 8 +- .../johnsnowlabs/util/spark/SparkUtil$.html | 8 +- .../com/johnsnowlabs/util/spark/index.html | 8 +- docs/api/index.html | 8 +- docs/api/index.js | 2 +- docs/api/python/.buildinfo | 2 +- docs/api/python/genindex.html | 19 +- docs/api/python/getting_started/index.html | 20 +- docs/api/python/index.html | 2 +- docs/api/python/modules/index.html | 3 +- docs/api/python/modules/sparknlp.html | 6 +- .../python/modules/sparknlp/annotation.html | 2 +- .../modules/sparknlp/annotation_audio.html | 2 +- .../modules/sparknlp/annotation_image.html | 2 +- .../annotator/audio/hubert_for_ctc.html | 2 +- .../annotator/audio/wav2vec2_for_ctc.html | 2 +- .../annotator/audio/whisper_for_ctc.html | 2 +- .../sparknlp/annotator/chunk2_doc.html | 2 +- .../modules/sparknlp/annotator/chunker.html | 2 +- .../albert_for_question_answering.html | 2 +- .../albert_for_sequence_classification.html | 2 +- .../albert_for_token_classification.html | 2 +- .../bart_for_zero_shot_classification.html | 2 +- .../bert_for_question_answering.html | 2 +- .../bert_for_sequence_classification.html | 2 +- .../bert_for_token_classification.html | 2 +- .../bert_for_zero_shot_classification.html | 2 +- .../camembert_for_question_answering.html | 2 +- ...camembert_for_sequence_classification.html | 2 +- .../camembert_for_token_classification.html | 2 +- 
.../classifier_dl/classifier_dl.html | 2 +- .../deberta_for_question_answering.html | 2 +- .../deberta_for_sequence_classification.html | 2 +- .../deberta_for_token_classification.html | 2 +- .../distil_bert_for_question_answering.html | 2 +- ...stil_bert_for_sequence_classification.html | 2 +- .../distil_bert_for_token_classification.html | 2 +- ...til_bert_for_zero_shot_classification.html | 2 +- .../longformer_for_question_answering.html | 2 +- ...ongformer_for_sequence_classification.html | 2 +- .../longformer_for_token_classification.html | 2 +- .../classifier_dl/multi_classifier_dl.html | 2 +- .../roberta_for_question_answering.html | 2 +- .../roberta_for_sequence_classification.html | 2 +- .../roberta_for_token_classification.html | 2 +- .../roberta_for_zero_shot_classification.html | 2 +- .../annotator/classifier_dl/sentiment_dl.html | 2 +- .../tapas_for_question_answering.html | 2 +- .../xlm_roberta_for_question_answering.html | 2 +- ...m_roberta_for_sequence_classification.html | 2 +- .../xlm_roberta_for_token_classification.html | 2 +- ..._roberta_for_zero_shot_classification.html | 2 +- .../xlnet_for_sequence_classification.html | 2 +- .../xlnet_for_token_classification.html | 2 +- .../annotator/coref/spanbert_coref.html | 2 +- .../cv/clip_for_zero_shot_classification.html | 2 +- .../cv/convnext_for_image_classification.html | 2 +- .../cv/swin_for_image_classification.html | 2 +- ..._encoder_decoder_for_image_captioning.html | 2 +- .../cv/vit_for_image_classification.html | 2 +- .../sparknlp/annotator/date2_chunk.html | 2 +- .../dependency/dependency_parser.html | 2 +- .../dependency/typed_dependency_parser.html | 2 +- .../document_character_text_splitter.html | 2 +- .../annotator/document_normalizer.html | 2 +- .../annotator/document_token_splitter.html | 2 +- .../document_token_splitter_test.html | 2 +- .../embeddings/albert_embeddings.html | 2 +- .../annotator/embeddings/bert_embeddings.html | 2 +- .../embeddings/bert_sentence_embeddings.html | 2 +- 
.../annotator/embeddings/bge_embeddings.html | 608 +++ .../embeddings/camembert_embeddings.html | 2 +- .../embeddings/chunk_embeddings.html | 2 +- .../embeddings/deberta_embeddings.html | 2 +- .../embeddings/distil_bert_embeddings.html | 2 +- .../annotator/embeddings/doc2vec.html | 2 +- .../annotator/embeddings/e5_embeddings.html | 2 +- .../annotator/embeddings/elmo_embeddings.html | 2 +- .../embeddings/instructor_embeddings.html | 2 +- .../embeddings/longformer_embeddings.html | 2 +- .../embeddings/mpnet_embeddings.html | 2 +- .../embeddings/roberta_embeddings.html | 2 +- .../roberta_sentence_embeddings.html | 2 +- .../embeddings/sentence_embeddings.html | 2 +- .../universal_sentence_encoder.html | 2 +- .../annotator/embeddings/word2vec.html | 2 +- .../annotator/embeddings/word_embeddings.html | 2 +- .../embeddings/xlm_roberta_embeddings.html | 2 +- .../xlm_roberta_sentence_embeddings.html | 2 +- .../embeddings/xlnet_embeddings.html | 2 +- .../sparknlp/annotator/er/entity_ruler.html | 2 +- .../sparknlp/annotator/graph_extraction.html | 2 +- .../yake_keyword_extraction.html | 2 +- .../annotator/ld_dl/language_detector_dl.html | 2 +- .../sparknlp/annotator/lemmatizer.html | 2 +- .../annotator/matcher/big_text_matcher.html | 2 +- .../annotator/matcher/date_matcher.html | 2 +- .../annotator/matcher/multi_date_matcher.html | 2 +- .../annotator/matcher/regex_matcher.html | 2 +- .../annotator/matcher/text_matcher.html | 2 +- .../sparknlp/annotator/n_gram_generator.html | 2 +- .../sparknlp/annotator/ner/ner_approach.html | 2 +- .../sparknlp/annotator/ner/ner_converter.html | 2 +- .../sparknlp/annotator/ner/ner_crf.html | 2 +- .../sparknlp/annotator/ner/ner_dl.html | 2 +- .../annotator/ner/ner_overwriter.html | 2 +- .../annotator/ner/zero_shot_ner_model.html | 2 +- .../sparknlp/annotator/normalizer.html | 2 +- .../annotator/openai/openai_completion.html | 2 +- .../annotator/openai/openai_embeddings.html | 2 +- .../annotator/param/classifier_encoder.html | 2 +- 
.../annotator/param/evaluation_dl_params.html | 2 +- .../sparknlp/annotator/pos/perceptron.html | 2 +- .../annotator/sentence/sentence_detector.html | 2 +- .../sentence/sentence_detector_dl.html | 2 +- .../sentiment/sentiment_detector.html | 2 +- .../annotator/sentiment/vivekn_sentiment.html | 2 +- .../annotator/seq2seq/bart_transformer.html | 2 +- .../annotator/seq2seq/gpt2_transformer.html | 2 +- .../annotator/seq2seq/marian_transformer.html | 2 +- .../annotator/seq2seq/t5_transformer.html | 2 +- .../document_similarity_ranker.html | 2 +- .../spell_check/context_spell_checker.html | 2 +- .../spell_check/norvig_sweeting.html | 2 +- .../spell_check/symmetric_delete.html | 2 +- .../modules/sparknlp/annotator/stemmer.html | 2 +- .../annotator/stop_words_cleaner.html | 2 +- .../annotator/tf_ner_dl_graph_builder.html | 2 +- .../annotator/token/chunk_tokenizer.html | 2 +- .../annotator/token/recursive_tokenizer.html | 2 +- .../annotator/token/regex_tokenizer.html | 2 +- .../sparknlp/annotator/token/tokenizer.html | 2 +- .../sparknlp/annotator/token2_chunk.html | 2 +- .../sparknlp/annotator/ws/word_segmenter.html | 2 +- .../sparknlp/base/audio_assembler.html | 2 +- .../modules/sparknlp/base/doc2_chunk.html | 2 +- .../sparknlp/base/document_assembler.html | 2 +- .../sparknlp/base/embeddings_finisher.html | 2 +- .../modules/sparknlp/base/finisher.html | 2 +- .../modules/sparknlp/base/graph_finisher.html | 2 +- .../sparknlp/base/has_recursive_fit.html | 2 +- .../base/has_recursive_transform.html | 2 +- .../sparknlp/base/image_assembler.html | 2 +- .../modules/sparknlp/base/light_pipeline.html | 2 +- .../base/multi_document_assembler.html | 2 +- .../sparknlp/base/recursive_pipeline.html | 2 +- .../sparknlp/base/table_assembler.html | 2 +- .../sparknlp/base/token_assembler.html | 2 +- .../sparknlp/common/annotator_approach.html | 2 +- .../sparknlp/common/annotator_model.html | 2 +- .../sparknlp/common/annotator_properties.html | 2 +- .../sparknlp/common/match_strategy.html | 
2 +- .../modules/sparknlp/common/properties.html | 2 +- .../modules/sparknlp/common/read_as.html | 2 +- .../common/recursive_annotator_approach.html | 2 +- .../python/modules/sparknlp/common/utils.html | 2 +- .../python/modules/sparknlp/functions.html | 2 +- .../sparknlp/internal/annotator_java_ml.html | 2 +- .../internal/annotator_transformer.html | 2 +- .../internal/extended_java_wrapper.html | 2 +- .../internal/params_getters_setters.html | 2 +- .../modules/sparknlp/internal/recursive.html | 2 +- .../modules/sparknlp/logging/comet.html | 2 +- .../pretrained/pretrained_pipeline.html | 2 +- .../pretrained/resource_downloader.html | 2 +- .../modules/sparknlp/training/conll.html | 2 +- .../modules/sparknlp/training/conllu.html | 2 +- .../python/modules/sparknlp/training/pos.html | 2 +- .../modules/sparknlp/training/pub_tator.html | 2 +- .../training/spacy_to_annotation.html | 2 +- docs/api/python/objects.inv | Bin 13593 -> 13638 bytes docs/api/python/py-modindex.html | 7 +- .../sparknlp/annotation/index.html | 2 +- .../sparknlp/annotation_audio/index.html | 2 +- .../sparknlp/annotation_image/index.html | 2 +- .../annotator/audio/hubert_for_ctc/index.html | 2 +- .../sparknlp/annotator/audio/index.html | 2 +- .../audio/wav2vec2_for_ctc/index.html | 2 +- .../audio/whisper_for_ctc/index.html | 2 +- .../sparknlp/annotator/chunk2_doc/index.html | 3 +- .../sparknlp/annotator/chunker/index.html | 3 +- .../albert_for_question_answering/index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../index.html | 3 +- .../bert_for_question_answering/index.html | 2 +- .../index.html | 3 +- .../bert_for_token_classification/index.html | 3 +- .../index.html | 3 +- .../index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../classifier_dl/classifier_dl/index.html | 3 +- .../deberta_for_question_answering/index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../index.html | 3 +- 
.../annotator/classifier_dl/index.html | 3 +- .../index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../multi_classifier_dl/index.html | 3 +- .../roberta_for_question_answering/index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../index.html | 3 +- .../classifier_dl/sentiment_dl/index.html | 3 +- .../tapas_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 3 +- .../index.html | 3 +- .../index.html | 3 +- .../index.html | 3 +- .../xlnet_for_token_classification/index.html | 3 +- .../sparknlp/annotator/coref/index.html | 2 +- .../annotator/coref/spanbert_coref/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../sparknlp/annotator/cv/index.html | 2 +- .../swin_for_image_classification/index.html | 2 +- .../index.html | 2 +- .../vit_for_image_classification/index.html | 2 +- .../sparknlp/annotator/date2_chunk/index.html | 3 +- .../dependency/dependency_parser/index.html | 3 +- .../sparknlp/annotator/dependency/index.html | 3 +- .../typed_dependency_parser/index.html | 3 +- .../index.html | 3 +- .../annotator/document_normalizer/index.html | 3 +- .../document_token_splitter/index.html | 3 +- .../document_token_splitter_test/index.html | 2 +- .../embeddings/albert_embeddings/index.html | 3 +- .../embeddings/bert_embeddings/index.html | 3 +- .../bert_sentence_embeddings/index.html | 3 +- .../embeddings/bge_embeddings/index.html | 764 ++++ .../camembert_embeddings/index.html | 3 +- .../embeddings/chunk_embeddings/index.html | 3 +- .../embeddings/deberta_embeddings/index.html | 3 +- .../distil_bert_embeddings/index.html | 3 +- .../annotator/embeddings/doc2vec/index.html | 3 +- .../embeddings/e5_embeddings/index.html | 3 +- .../embeddings/elmo_embeddings/index.html | 3 +- .../sparknlp/annotator/embeddings/index.html | 4 +- .../instructor_embeddings/index.html | 3 +- .../longformer_embeddings/index.html | 3 +- .../embeddings/mpnet_embeddings/index.html | 3 +- .../embeddings/roberta_embeddings/index.html | 3 +- 
.../roberta_sentence_embeddings/index.html | 3 +- .../embeddings/sentence_embeddings/index.html | 3 +- .../universal_sentence_encoder/index.html | 3 +- .../annotator/embeddings/word2vec/index.html | 3 +- .../embeddings/word_embeddings/index.html | 3 +- .../xlm_roberta_embeddings/index.html | 3 +- .../index.html | 3 +- .../embeddings/xlnet_embeddings/index.html | 3 +- .../annotator/er/entity_ruler/index.html | 3 +- .../sparknlp/annotator/er/index.html | 3 +- .../annotator/graph_extraction/index.html | 3 +- .../autosummary/sparknlp/annotator/index.html | 4 +- .../annotator/keyword_extraction/index.html | 3 +- .../yake_keyword_extraction/index.html | 3 +- .../sparknlp/annotator/ld_dl/index.html | 3 +- .../ld_dl/language_detector_dl/index.html | 3 +- .../sparknlp/annotator/lemmatizer/index.html | 3 +- .../matcher/big_text_matcher/index.html | 3 +- .../annotator/matcher/date_matcher/index.html | 3 +- .../sparknlp/annotator/matcher/index.html | 3 +- .../matcher/multi_date_matcher/index.html | 3 +- .../matcher/regex_matcher/index.html | 3 +- .../annotator/matcher/text_matcher/index.html | 3 +- .../annotator/n_gram_generator/index.html | 3 +- .../sparknlp/annotator/ner/index.html | 3 +- .../annotator/ner/ner_approach/index.html | 3 +- .../annotator/ner/ner_converter/index.html | 3 +- .../sparknlp/annotator/ner/ner_crf/index.html | 3 +- .../sparknlp/annotator/ner/ner_dl/index.html | 3 +- .../annotator/ner/ner_overwriter/index.html | 3 +- .../ner/zero_shot_ner_model/index.html | 2 +- .../sparknlp/annotator/normalizer/index.html | 3 +- .../sparknlp/annotator/openai/index.html | 3 +- .../openai/openai_completion/index.html | 3 +- .../openai/openai_embeddings/index.html | 3 +- .../param/classifier_encoder/index.html | 2 +- .../param/evaluation_dl_params/index.html | 2 +- .../sparknlp/annotator/param/index.html | 3 +- .../sparknlp/annotator/pos/index.html | 3 +- .../annotator/pos/perceptron/index.html | 3 +- .../sparknlp/annotator/sentence/index.html | 3 +- 
.../sentence/sentence_detector/index.html | 3 +- .../sentence/sentence_detector_dl/index.html | 3 +- .../sparknlp/annotator/sentiment/index.html | 3 +- .../sentiment/sentiment_detector/index.html | 3 +- .../sentiment/vivekn_sentiment/index.html | 3 +- .../seq2seq/bart_transformer/index.html | 3 +- .../seq2seq/gpt2_transformer/index.html | 3 +- .../sparknlp/annotator/seq2seq/index.html | 3 +- .../seq2seq/marian_transformer/index.html | 3 +- .../seq2seq/t5_transformer/index.html | 3 +- .../document_similarity_ranker/index.html | 2 +- .../sparknlp/annotator/similarity/index.html | 2 +- .../context_spell_checker/index.html | 3 +- .../sparknlp/annotator/spell_check/index.html | 3 +- .../spell_check/norvig_sweeting/index.html | 3 +- .../spell_check/symmetric_delete/index.html | 3 +- .../sparknlp/annotator/stemmer/index.html | 3 +- .../annotator/stop_words_cleaner/index.html | 3 +- .../tf_ner_dl_graph_builder/index.html | 2 +- .../token/chunk_tokenizer/index.html | 3 +- .../sparknlp/annotator/token/index.html | 3 +- .../token/recursive_tokenizer/index.html | 3 +- .../token/regex_tokenizer/index.html | 3 +- .../annotator/token/tokenizer/index.html | 3 +- .../annotator/token2_chunk/index.html | 3 +- .../sparknlp/annotator/ws/index.html | 3 +- .../annotator/ws/word_segmenter/index.html | 3 +- .../sparknlp/base/audio_assembler/index.html | 2 +- .../sparknlp/base/doc2_chunk/index.html | 2 +- .../base/document_assembler/index.html | 2 +- .../base/embeddings_finisher/index.html | 2 +- .../sparknlp/base/finisher/index.html | 2 +- .../sparknlp/base/graph_finisher/index.html | 2 +- .../base/has_recursive_fit/index.html | 2 +- .../base/has_recursive_transform/index.html | 2 +- .../sparknlp/base/image_assembler/index.html | 2 +- .../autosummary/sparknlp/base/index.html | 2 +- .../sparknlp/base/light_pipeline/index.html | 2 +- .../base/multi_document_assembler/index.html | 2 +- .../base/recursive_pipeline/index.html | 2 +- .../sparknlp/base/table_assembler/index.html | 2 +- 
.../sparknlp/base/token_assembler/index.html | 2 +- .../common/annotator_approach/index.html | 2 +- .../common/annotator_model/index.html | 2 +- .../common/annotator_properties/index.html | 2 +- .../sparknlp/common/annotator_type/index.html | 2 +- .../common/coverage_result/index.html | 2 +- .../autosummary/sparknlp/common/index.html | 2 +- .../sparknlp/common/match_strategy/index.html | 2 +- .../sparknlp/common/properties/index.html | 2 +- .../sparknlp/common/read_as/index.html | 2 +- .../recursive_annotator_approach/index.html | 2 +- .../sparknlp/common/storage/index.html | 2 +- .../sparknlp/common/utils/index.html | 2 +- .../autosummary/sparknlp/functions/index.html | 2 +- .../reference/autosummary/sparknlp/index.html | 2 +- .../internal/annotator_java_ml/index.html | 2 +- .../internal/annotator_transformer/index.html | 2 +- .../internal/extended_java_wrapper/index.html | 2 +- .../autosummary/sparknlp/internal/index.html | 2 +- .../params_getters_setters/index.html | 2 +- .../sparknlp/internal/recursive/index.html | 2 +- .../sparknlp/logging/comet/index.html | 2 +- .../autosummary/sparknlp/logging/index.html | 2 +- .../sparknlp/pretrained/index.html | 2 +- .../pretrained/pretrained_pipeline/index.html | 2 +- .../pretrained/resource_downloader/index.html | 2 +- .../sparknlp/pretrained/utils/index.html | 2 +- .../sparknlp/training/conll/index.html | 2 +- .../sparknlp/training/conllu/index.html | 2 +- .../autosummary/sparknlp/training/index.html | 2 +- .../sparknlp/training/pos/index.html | 2 +- .../sparknlp/training/pub_tator/index.html | 2 +- .../training/spacy_to_annotation/index.html | 2 +- .../sparknlp/training/tfgraphs/index.html | 2 +- .../sparknlp/upload_to_hub/index.html | 2 +- .../autosummary/sparknlp/util/index.html | 2 +- docs/api/python/reference/index.html | 2 +- docs/api/python/search.html | 2 +- docs/api/python/searchindex.js | 2 +- .../python/static/documentation_options.js | 2 +- docs/api/python/third_party/Comet.html | 2 +- 
docs/api/python/third_party/MLflow.html | 2 +- docs/api/python/third_party/index.html | 2 +- docs/api/python/user_guide/annotation.html | 2 +- docs/api/python/user_guide/annotators.html | 2 +- .../python/user_guide/custom_pipelines.html | 2 +- docs/api/python/user_guide/helpers.html | 2 +- docs/api/python/user_guide/index.html | 2 +- .../python/user_guide/light_pipelines.html | 2 +- .../user_guide/pretrained_pipelines.html | 2 +- docs/api/python/user_guide/training.html | 2 +- docs/api/scala/collection/compat/index.html | 8 +- docs/api/scala/collection/index.html | 8 +- docs/api/scala/index.html | 8 +- .../ml/ai/DeBertaClassification.scala | 5 +- .../com/johnsnowlabs/ml/util/LinAlg.scala | 189 +- .../dl/DeBertaForQuestionAnswering.scala | 12 +- .../dl/DeBertaForSequenceClassification.scala | 5 +- .../dl/DeBertaForTokenClassification.scala | 21 +- .../nlp/embeddings/BGEEmbeddings.scala | 26 +- .../com/johnsnowlabs/ml/util/LinAlgTest.scala | 16 +- .../nlp/embeddings/E5EmbeddingsTestSpec.scala | 21 +- .../embeddings/MPNetEmbeddingsTestSpec.scala | 21 +- 1478 files changed, 17489 insertions(+), 5136 deletions(-) create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/DocumentCharacterTextSplitter$.html create mode 100644 docs/api/com/johnsnowlabs/nlp/annotators/DocumentTokenSplitter$.html create mode 100644 docs/api/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings$.html create mode 100644 docs/api/com/johnsnowlabs/nlp/embeddings/BGEEmbeddings.html create mode 100644 docs/api/com/johnsnowlabs/nlp/embeddings/ReadBGEDLModel.html create mode 100644 docs/api/com/johnsnowlabs/nlp/embeddings/ReadablePretrainedBGEModel.html rename docs/api/com/johnsnowlabs/nlp/util/{SparkNlpConfigKeys$.html => SparkNlpConfig$.html} (91%) create mode 100644 docs/api/python/modules/sparknlp/annotator/embeddings/bge_embeddings.html create mode 100644 docs/api/python/reference/autosummary/sparknlp/annotator/embeddings/bge_embeddings/index.html diff --git a/docs/api/com/index.html 
b/docs/api/com/index.html index fd4f44dbccf2d9..d58a0431a374d6 100644 --- a/docs/api/com/index.html +++ b/docs/api/com/index.html @@ -3,9 +3,9 @@ - Spark NLP 5.2.0 ScalaDoc - com - - + Spark NLP 5.2.1 ScalaDoc - com + + @@ -28,7 +28,7 @@