From 7f78be36e135e20dd46c27af32834b4c25cd718f Mon Sep 17 00:00:00 2001 From: Devin Ha <33089471+DevinTDHa@users.noreply.github.com> Date: Thu, 7 Sep 2023 18:17:03 +0200 Subject: [PATCH 01/12] [SPARKNLP-906] Fix reading suffix (#13945) --- .../scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala | 2 +- .../johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala index e67bbb3a6211fb..85578509a90869 100644 --- a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala +++ b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala @@ -126,7 +126,7 @@ trait ReadOnnxModel { val wrappers = (modelNames map { modelName: String => // 2. Copy to local dir - val localModelFile = modelName + suffix + val localModelFile = modelName fs.copyToLocalFile(new Path(path, localModelFile), new Path(tmpFolder)) val localPath = new Path(tmpFolder, localModelFile).toString diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala index bc1ae6e323c437..0d37be98bcb1d7 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala @@ -32,12 +32,12 @@ class WhisperForCTCTest extends AnyFlatSpec with WhisperForCTCBehaviors { // Needs to be added manually lazy val modelTf: WhisperForCTC = WhisperForCTC - .loadSavedModel("exported_tf/openai/whisper-tiny", ResourceHelper.spark) + .pretrained("asr_whisper_tiny") .setInputCols("audio_assembler") .setOutputCol("document") lazy val modelOnnx: WhisperForCTC = WhisperForCTC - .loadSavedModel("exported_onnx/openai/whisper-tiny", ResourceHelper.spark) + .pretrained() .setInputCols("audio_assembler") 
.setOutputCol("document") From 96094c3951b3b65f6aba10fa2e436099d9b8e47b Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 7 Sep 2023 21:19:04 +0500 Subject: [PATCH 02/12] Sparknlp 888 Add ONNX support to MPNet embeddings (#13955) * adding onxx support to mpnet * remove name in test * updating default name for mpnet models in scala and python * updating default model name --- .../scala/com/johnsnowlabs/ml/ai/MPNet.scala | 73 +++++++++++++++++-- .../nlp/embeddings/MPNetEmbeddings.scala | 71 +++++++++++++----- .../embeddings/MPNetEmbeddingsTestSpec.scala | 7 +- 3 files changed, 123 insertions(+), 28 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/MPNet.scala b/src/main/scala/com/johnsnowlabs/ml/ai/MPNet.scala index 52025c9385934f..3efa6b1acaa92b 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/MPNet.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/MPNet.scala @@ -16,16 +16,19 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor +import com.johnsnowlabs.ml.onnx.OnnxWrapper import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp.{Annotation, AnnotatorType} import scala.collection.JavaConverters._ /** MPNET Sentence embeddings model * - * @param tensorflow + * @param tensorflowWrapper * tensorflow wrapper * @param configProtoBytes * config proto bytes @@ -37,7 +40,8 @@ import scala.collection.JavaConverters._ * signatures */ private[johnsnowlabs] class MPNet( - val tensorflow: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], configProtoBytes: Option[Array[Byte]] = None, sentenceStartTokenId: Int, sentenceEndTokenId: Int, @@ -47,8 +51,11 @@ private[johnsnowlabs] class MPNet( private val _tfInstructorSignatures: Map[String, String] = 
signatures.getOrElse(ModelSignatureManager.apply()) private val paddingTokenId = 1 - private val bosTokenId = 0 - private val eosTokenId = 2 + + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name /** Get sentence embeddings for a batch of sentences * @param batch @@ -57,6 +64,22 @@ private[johnsnowlabs] class MPNet( * sentence embeddings */ private def getSentenceEmbedding(batch: Seq[Array[Int]]): Array[Array[Float]] = { + val embeddings = detectedEngine match { + case ONNX.name => + getSentenceEmbeddingFromOnnx(batch) + case _ => + getSentenceEmbeddingFromTF(batch) + } + embeddings + } + + /** Get sentence embeddings for a batch of sentences + * @param batch + * batch of sentences + * @return + * sentence embeddings + */ + private def getSentenceEmbeddingFromTF(batch: Seq[Array[Int]]): Array[Array[Float]] = { // get max sentence length val sequencesLength = batch.map(x => x.length).toArray val maxSentenceLength = sequencesLength.max @@ -92,7 +115,7 @@ private[johnsnowlabs] class MPNet( tensorEncoder.createIntBufferTensor(shape, encoderAttentionMaskBuffers) // run model - val runner = tensorflow + val runner = tensorflowWrapper.get .getTFSessionWithSignature( configProtoBytes = configProtoBytes, initAllTables = false, @@ -131,6 +154,46 @@ private[johnsnowlabs] class MPNet( sentenceEmbeddingsFloatsArray } + private def getSentenceEmbeddingFromOnnx(batch: Seq[Array[Int]]): Array[Array[Float]] = { + val batchLength = batch.length + val maxSentenceLength = batch.map(pieceIds => pieceIds.length).max + + val (runner, env) = onnxWrapper.get.getSession() + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val segmentTensors = + OnnxTensor.createTensor(env, batch.map(x => 
Array.fill(maxSentenceLength)(0L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + // TODO: A try without a catch or finally is equivalent to putting its body in a block; no exceptions are handled. + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("last_hidden_state") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + + val dim = embeddings.length / batchLength + // group embeddings + val sentenceEmbeddingsFloatsArray = embeddings.grouped(dim).toArray + sentenceEmbeddingsFloatsArray + } finally if (results != null) results.close() + } + } + /** Predict sentence embeddings for a batch of sentences * @param sentences * sentences diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddings.scala index 63913bd434f57b..80e29d4c15dadf 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddings.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddings.scala @@ -17,13 +17,14 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.ml.ai.MPNet +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.util.LoadExternalModel.{ loadTextAsset, modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} @@ -145,6 +146,7 @@ class MPNetEmbeddings(override val uid: String) extends AnnotatorModel[MPNetEmbeddings] with HasBatchedAnnotate[MPNetEmbeddings] with WriteTensorflowModel + with WriteOnnxModel with HasEmbeddingsProperties with HasStorageRef 
with HasCaseSensitiveProperties @@ -229,12 +231,14 @@ class MPNetEmbeddings(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper): MPNetEmbeddings = { + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper]): MPNetEmbeddings = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new MPNet( tensorflowWrapper, + onnxWrapper, configProtoBytes = getConfigProtoBytes, sentenceStartTokenId = sentenceStartTokenId, sentenceEndTokenId = sentenceEndTokenId, @@ -336,14 +340,29 @@ class MPNetEmbeddings(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflow, - "_mpnet", - MPNetEmbeddings.tfFile, - configProtoBytes = getConfigProtoBytes, - savedSignatures = getSignatures) + val suffix = "_mpnet" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + MPNetEmbeddings.tfFile, + configProtoBytes = getConfigProtoBytes, + savedSignatures = getSignatures) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + MPNetEmbeddings.onnxFile) + + case _ => + throw new Exception(notSupportedEngineError) + } } /** @group getParam */ @@ -366,7 +385,7 @@ class MPNetEmbeddings(override val uid: String) trait ReadablePretrainedMPNetModel extends ParamsAndFeaturesReadable[MPNetEmbeddings] with HasPretrained[MPNetEmbeddings] { - override val defaultModelName: Some[String] = Some("mpnet_small") + override val defaultModelName: Some[String] = Some("all_mpnet_base_v2") /** Java compliant-overrides */ override def pretrained(): MPNetEmbeddings = super.pretrained() @@ -380,19 +399,26 @@ trait ReadablePretrainedMPNetModel super.pretrained(name, lang, remoteLoc) } -trait ReadMPNetDLModel extends ReadTensorflowModel { 
+trait ReadMPNetDLModel extends ReadTensorflowModel with ReadOnnxModel { this: ParamsAndFeaturesReadable[MPNetEmbeddings] => override val tfFile: String = "mpnet_tensorflow" + override val onnxFile: String = "mpnet_onnx" def readModel(instance: MPNetEmbeddings, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel( - path, - spark, - "_mpnet_tf", - savedSignatures = instance.getSignatures, - initAllTables = false) - instance.setModelIfNotSet(spark, tf) + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = readTensorflowModel(path, spark, "_mpnet_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None) + + case ONNX.name => + val onnxWrapper = + readOnnxModel(path, spark, "_mpnet_onnx", zipped = true, useBundle = false, None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper)) + + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -424,7 +450,12 @@ trait ReadMPNetDLModel extends ReadTensorflowModel { */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper) + .setModelIfNotSet(spark, Some(wrapper), None) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper)) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddingsTestSpec.scala index 5a7bbfc14b9419..4c0a9cca0b4c91 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddingsTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/MPNetEmbeddingsTestSpec.scala @@ -24,7 +24,7 @@ import org.scalatest.flatspec.AnyFlatSpec class MPNetEmbeddingsTestSpec extends AnyFlatSpec { - "E5 Embeddings" should "correctly embed multiple sentences" taggedAs SlowTest in { + "Mpnet Embeddings" should 
"correctly embed multiple sentences" taggedAs SlowTest in { import ResourceHelper.spark.implicits._ @@ -38,12 +38,13 @@ class MPNetEmbeddingsTestSpec extends AnyFlatSpec { val embeddings = MPNetEmbeddings .pretrained() .setInputCols(Array("document")) - .setOutputCol("e5") + .setOutputCol("mpnet") val pipeline = new Pipeline().setStages(Array(document, embeddings)) val pipelineDF = pipeline.fit(ddd).transform(ddd) - pipelineDF.select("e5.embeddings").show(truncate = false) + pipelineDF.select("mpnet.embeddings").show(truncate = false) } + } From fae23448d21cd929cfc607eba2a44b4aba758728 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:29:35 -0500 Subject: [PATCH 03/12] Adding ONNX Support to ALBERT Token and Sequence Classification and Question Answering annotators (#13956) * SPARKNLP-891 Adding ONNX support for AlbertQuestionAnswering SPARKNLP-892 Adding ONNX support for AlbertSequenceClassification SPARKNLP-893 Adding ONNX support for AlbertTokenClassification * SPARKNLP-891 Adding ONNX support for AlbertQuestionAnswering SPARKNLP-892 Adding ONNX support for AlbertSequenceClassification SPARKNLP-893 Adding ONNX support for AlbertTokenClassification --- .../ml/ai/AlbertClassification.scala | 234 ++++++++++++------ .../ml/ai/XXXForClassification.scala | 10 +- .../ml/ai/ZeroShotNerClassification.scala | 3 +- .../dl/AlbertForQuestionAnswering.scala | 72 ++++-- .../dl/AlbertForSequenceClassification.scala | 72 ++++-- .../dl/AlbertForTokenClassification.scala | 71 ++++-- 6 files changed, 338 insertions(+), 124 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala index aa6b561b0f34f7..d66e299015ccdb 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala @@ -16,9 +16,13 @@ package com.johnsnowlabs.ml.ai 
+import ai.onnxruntime.OnnxTensor +import com.johnsnowlabs.ml.onnx.OnnxWrapper import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.ml.util.LoadExternalModel.notSupportedEngineError +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.{ActivationFunction, Annotation} import org.tensorflow.ndarray.buffer.IntDataBuffer @@ -37,7 +41,8 @@ import scala.collection.JavaConverters._ * TF v2 signatures in Spark NLP */ private[johnsnowlabs] class AlbertClassification( - val tensorflowWrapper: TensorflowWrapper, + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], val spp: SentencePieceWrapper, configProtoBytes: Option[Array[Byte]] = None, tags: Map[String, Int], @@ -48,6 +53,10 @@ private[johnsnowlabs] class AlbertClassification( val _tfAlbertSignatures: Map[String, String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name // keys representing the input and output tensors of the ALBERT model protected val sentencePadTokenId: Int = spp.getSppModel.pieceToId("[pad]") @@ -95,59 +104,13 @@ private[johnsnowlabs] class AlbertClassification( } def tag(batch: Seq[Array[Int]]): Seq[Array[Array[Float]]] = { - val tensors = new TensorResources() - - val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val batchLength = batch.length + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max - val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - val maskBuffers: IntDataBuffer = 
tensors.createIntBuffer(batchLength * maxSentenceLength) - val segmentBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - - // [nb of encoded sentences , maxSentenceLength] - val shape = Array(batch.length.toLong, maxSentenceLength) - - batch.zipWithIndex - .foreach { case (sentence, idx) => - val offset = idx * maxSentenceLength - tokenBuffers.offset(offset).write(sentence) - maskBuffers - .offset(offset) - .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) - segmentBuffers.offset(offset).write(Array.fill(maxSentenceLength)(0)) - } - - val runner = tensorflowWrapper - .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) - .runner - - val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) - val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) - val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers) - - runner - .feed( - _tfAlbertSignatures.getOrElse( - ModelSignatureConstants.InputIds.key, - "missing_input_id_key"), - tokenTensors) - .feed( - _tfAlbertSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .feed( - _tfAlbertSignatures - .getOrElse(ModelSignatureConstants.TokenTypeIds.key, "missing_segment_ids_key"), - segmentTensors) - .fetch(_tfAlbertSignatures - .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) - - val outs = runner.run().asScala - val rawScores = TensorResources.extractFloats(outs.head) - - outs.foreach(_.close()) - tensors.clearSession(outs) - tensors.clearTensors() + val rawScores = detectedEngine match { + case ONNX.name => getRowScoresWithOnnx(batch, maxSentenceLength, sequence = true) + case _ => getRawScoresWithTF(batch, maxSentenceLength) + } val dim = rawScores.length / (batchLength * maxSentenceLength) val batchScores: Array[Array[Array[Float]]] = rawScores @@ -161,17 +124,39 @@ private[johnsnowlabs] class AlbertClassification( } 
def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] = { + val batchLength = batch.length + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + + val rawScores = detectedEngine match { + case ONNX.name => getRowScoresWithOnnx(batch, maxSentenceLength, sequence = true) + case _ => getRawScoresWithTF(batch, maxSentenceLength) + } + + val dim = rawScores.length / batchLength + val batchScores: Array[Array[Float]] = + rawScores + .grouped(dim) + .map(scores => + activation match { + case ActivationFunction.softmax => calculateSoftmax(scores) + case ActivationFunction.sigmoid => calculateSigmoid(scores) + case _ => calculateSoftmax(scores) + }) + .toArray + + batchScores + } + + private def getRawScoresWithTF(batch: Seq[Array[Int]], maxSentenceLength: Int): Array[Float] = { val tensors = new TensorResources() - val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val batchLength = batch.length - val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) val segmentBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) // [nb of encoded sentences , maxSentenceLength] - val shape = Array(batch.length.toLong, maxSentenceLength) + val shape = Array(batchLength.toLong, maxSentenceLength) batch.zipWithIndex .foreach { case (sentence, idx) => @@ -183,7 +168,7 @@ private[johnsnowlabs] class AlbertClassification( segmentBuffers.offset(offset).write(Array.fill(maxSentenceLength)(0)) } - val runner = tensorflowWrapper + val runner = tensorflowWrapper.get .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) .runner @@ -215,19 +200,51 @@ private[johnsnowlabs] class AlbertClassification( tensors.clearSession(outs) tensors.clearTensors() - val dim = rawScores.length / batchLength - val batchScores: Array[Array[Float]] = 
- rawScores - .grouped(dim) - .map(scores => - activation match { - case ActivationFunction.softmax => calculateSoftmax(scores) - case ActivationFunction.sigmoid => calculateSigmoid(scores) - case _ => calculateSoftmax(scores) - }) - .toArray + rawScores + } - batchScores + private def getRowScoresWithOnnx( + batch: Seq[Array[Int]], + maxSentenceLength: Int, + sequence: Boolean): Array[Float] = { + + val output = if (sequence) "logits" else "last_hidden_state" + + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val segmentTensors = + OnnxTensor.createTensor(env, batch.map(x => Array.fill(maxSentenceLength)(0L)).toArray) + + val inputs = + Map( + "input_ids" -> tokenTensors, + "attention_mask" -> maskTensors, + "token_type_ids" -> segmentTensors).asJava + + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get(output) + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + + embeddings + } finally if (results != null) results.close() + } } def tagZeroShotSequence( @@ -237,10 +254,29 @@ private[johnsnowlabs] class AlbertClassification( activation: String): Array[Array[Float]] = ??? 
def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { - val tensors = new TensorResources() - + val batchLength = batch.length val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + val (startLogits, endLogits) = detectedEngine match { + case ONNX.name => computeLogitsWithOnnx(batch, maxSentenceLength) + case _ => computeLogitsWithTF(batch, maxSentenceLength) + } + + val endDim = endLogits.length / batchLength + val endScores: Array[Array[Float]] = + endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray + + val startDim = startLogits.length / batchLength + val startScores: Array[Array[Float]] = + startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + + (startScores, endScores) + } + + private def computeLogitsWithTF( + batch: Seq[Array[Int]], + maxSentenceLength: Int): (Array[Float], Array[Float]) = { val batchLength = batch.length + val tensors = new TensorResources() val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) @@ -271,7 +307,7 @@ private[johnsnowlabs] class AlbertClassification( }) } - val runner = tensorflowWrapper + val runner = tensorflowWrapper.get .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) .runner @@ -306,15 +342,55 @@ private[johnsnowlabs] class AlbertClassification( tensors.clearSession(outs) tensors.clearTensors() - val endDim = endLogits.length / batchLength - val endScores: Array[Array[Float]] = - endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray - - val startDim = startLogits.length / batchLength - val startScores: Array[Array[Float]] = - startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + (endLogits, startLogits) + } - (startScores, endScores) + private def computeLogitsWithOnnx( + batch: Seq[Array[Int]], + 
maxSentenceLength: Int): (Array[Float], Array[Float]) = { + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession() + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val segmentTensors = + OnnxTensor.createTensor(env, batch.map(x => Array.fill(maxSentenceLength)(0L)).toArray) + + val inputs = + Map( + "input_ids" -> tokenTensors, + "attention_mask" -> maskTensors, + "token_type_ids" -> segmentTensors).asJava + + try { + val output = runner.run(inputs) + try { + val startLogits = output + .get("start_logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + val endLogits = output + .get("end_logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + tokenTensors.close() + maskTensors.close() + segmentTensors.close() + + (startLogits.slice(1, startLogits.length), endLogits.slice(1, endLogits.length)) + } finally if (output != null) output.close() + } } def findIndexedToken( diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/XXXForClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/XXXForClassification.scala index b6e0e18863e819..919d6aa0d17c6e 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/XXXForClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/XXXForClassification.scala @@ -16,6 +16,7 @@ package com.johnsnowlabs.ml.ai +import com.johnsnowlabs.ml.util.TensorFlow import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.{ActivationFunction, Annotation, AnnotatorType} @@ -244,7 +245,8 @@ private[johnsnowlabs] trait XXXForClassification { documents: Seq[Annotation], maxSentenceLength: Int, caseSensitive: Boolean, - mergeTokenStrategy: String = MergeTokenStrategy.vocab): Seq[Annotation] = { + mergeTokenStrategy: String = MergeTokenStrategy.vocab, + engine: 
String = TensorFlow.name): Seq[Annotation] = { val questionAnnot = Seq(documents.head) val contextAnnot = documents.drop(1) @@ -264,9 +266,13 @@ private[johnsnowlabs] trait XXXForClassification { val startIndex = startScores.zipWithIndex.maxBy(_._1) val endIndex = endScores.zipWithIndex.maxBy(_._1) + val offsetStartIndex = if (engine == TensorFlow.name) 2 else 1 + val offsetEndIndex = if (engine == TensorFlow.name) 1 else 0 + val allTokenPieces = wordPieceTokenizedQuestion.head.tokens ++ wordPieceTokenizedContext.flatMap(x => x.tokens) - val decodedAnswer = allTokenPieces.slice(startIndex._2 - 2, endIndex._2 - 1) + val decodedAnswer = + allTokenPieces.slice(startIndex._2 - offsetStartIndex, endIndex._2 - offsetEndIndex) val content = mergeTokenStrategy match { case MergeTokenStrategy.vocab => diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/ZeroShotNerClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/ZeroShotNerClassification.scala index 0553b1a94c3028..57a60fe26ea175 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/ZeroShotNerClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/ZeroShotNerClassification.scala @@ -62,7 +62,8 @@ private[johnsnowlabs] class ZeroShotNerClassification( documents: Seq[Annotation], maxSentenceLength: Int, caseSensitive: Boolean, - mergeTokenStrategy: String): Seq[Annotation] = { + mergeTokenStrategy: String, + engine: String): Seq[Annotation] = { val questionAnnot = Seq(documents.head) val contextAnnot = documents.drop(1) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala index 217fbc6ca25947..3c7e1347e70be1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForQuestionAnswering.scala @@ -17,6 +17,7 @@ package 
com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.{AlbertClassification, MergeTokenStrategy} +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -28,8 +29,9 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.embeddings.BertEmbeddings import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.param.{IntArrayParam, IntParam} @@ -116,6 +118,7 @@ class AlbertForQuestionAnswering(override val uid: String) extends AnnotatorModel[AlbertForQuestionAnswering] with HasBatchedAnnotate[AlbertForQuestionAnswering] with WriteTensorflowModel + with WriteOnnxModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine { @@ -196,13 +199,15 @@ class AlbertForQuestionAnswering(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): AlbertForQuestionAnswering = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new AlbertClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = Map.empty[String, Int], @@ -244,7 +249,8 @@ class AlbertForQuestionAnswering(override val uid: String) documents, $(maxSentenceLength), $(caseSensitive), - MergeTokenStrategy.sentencePiece) + MergeTokenStrategy.sentencePiece, + getEngine) } else { Seq.empty[Annotation] } @@ -253,13 +259,26 @@ class AlbertForQuestionAnswering(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = 
{ super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_albert_classification", - AlbertForQuestionAnswering.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_albert_classification" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + AlbertForQuestionAnswering.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + AlbertForQuestionAnswering.onnxFile) + } + writeSentencePieceModel( path, spark, @@ -291,17 +310,37 @@ trait ReadablePretrainedAlbertForQAModel trait ReadAlbertForQuestionAnsweringDLModel extends ReadTensorflowModel + with ReadOnnxModel with ReadSentencePieceModel { this: ParamsAndFeaturesReadable[AlbertForQuestionAnswering] => override val tfFile: String = "albert_classification_tensorflow" + override val onnxFile: String = "albert_classification_onnx" override val sppFile: String = "albert_spp" def readModel(instance: AlbertForQuestionAnswering, path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_albert_classification_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_albert_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + + instance.getEngine match { + case TensorFlow.name => + val tf = + readTensorflowModel(path, spark, "_albert_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tf), None, spp) + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "_albert_classification_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + case _ => + throw new Exception(notSupportedEngineError) + } + } addReader(readModel) @@ -318,7 +357,7 @@ trait ReadAlbertForQuestionAnsweringDLModel detectedEngine match { case 
TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -331,7 +370,12 @@ trait ReadAlbertForQuestionAnsweringDLModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala index f0d61bcaade650..16b9e6c196e37d 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForSequenceClassification.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.AlbertClassification +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -29,7 +30,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -124,6 +125,7 @@ class AlbertForSequenceClassification(override val uid: String) extends AnnotatorModel[AlbertForSequenceClassification] with HasBatchedAnnotate[AlbertForSequenceClassification] with WriteTensorflowModel + 
with WriteOnnxModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasClassifierActivationProperties @@ -239,13 +241,15 @@ class AlbertForSequenceClassification(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): AlbertForSequenceClassification = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new AlbertClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -305,13 +309,26 @@ class AlbertForSequenceClassification(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_albert_classification", - AlbertForSequenceClassification.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_albert_classification" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + "_albert_classification", + AlbertForSequenceClassification.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + AlbertForSequenceClassification.onnxFile) + } + writeSentencePieceModel( path, spark, @@ -341,10 +358,14 @@ trait ReadablePretrainedAlbertForSequenceModel super.pretrained(name, lang, remoteLoc) } -trait ReadAlbertForSequenceDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadAlbertForSequenceDLModel + extends ReadTensorflowModel + with ReadOnnxModel + with ReadSentencePieceModel { this: ParamsAndFeaturesReadable[AlbertForSequenceClassification] => override val tfFile: String = "albert_classification_tensorflow" + override val onnxFile: String = 
"albert_classification_onnx" override val sppFile: String = "albert_spp" def readModel( @@ -352,9 +373,27 @@ trait ReadAlbertForSequenceDLModel extends ReadTensorflowModel with ReadSentence path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_albert_classification_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_albert_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + + instance.getEngine match { + case TensorFlow.name => + val tf = + readTensorflowModel(path, spark, "_albert_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tf), None, spp) + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "_albert_classification_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + case _ => + throw new Exception(notSupportedEngineError) + } + } addReader(readModel) @@ -373,7 +412,7 @@ trait ReadAlbertForSequenceDLModel extends ReadTensorflowModel with ReadSentence detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -386,7 +425,12 @@ trait ReadAlbertForSequenceDLModel extends ReadTensorflowModel with ReadSentence */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala index 
89e61223d63097..8f91eb208ffc4b 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/AlbertForTokenClassification.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.AlbertClassification +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -29,7 +30,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ModelEngine, TensorFlow} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -123,6 +124,7 @@ class AlbertForTokenClassification(override val uid: String) extends AnnotatorModel[AlbertForTokenClassification] with HasBatchedAnnotate[AlbertForTokenClassification] with WriteTensorflowModel + with WriteOnnxModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine { @@ -217,13 +219,15 @@ class AlbertForTokenClassification(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): AlbertForTokenClassification = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new AlbertClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -276,13 +280,26 @@ class AlbertForTokenClassification(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - 
getModelIfNotSet.tensorflowWrapper, - "_albert_classification", - AlbertForTokenClassification.tfFile, - configProtoBytes = getConfigProtoBytes) + val suffix = "_albert_classification" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + AlbertForTokenClassification.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + AlbertForTokenClassification.onnxFile) + } + writeSentencePieceModel( path, spark, @@ -312,10 +329,14 @@ trait ReadablePretrainedAlbertForTokenModel remoteLoc: String): AlbertForTokenClassification = super.pretrained(name, lang, remoteLoc) } -trait ReadAlbertForTokenDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadAlbertForTokenDLModel + extends ReadTensorflowModel + with ReadOnnxModel + with ReadSentencePieceModel { this: ParamsAndFeaturesReadable[AlbertForTokenClassification] => override val tfFile: String = "albert_classification_tensorflow" + override val onnxFile: String = "albert_classification_onnx" override val sppFile: String = "albert_spp" def readModel( @@ -323,9 +344,27 @@ trait ReadAlbertForTokenDLModel extends ReadTensorflowModel with ReadSentencePie path: String, spark: SparkSession): Unit = { - val tf = readTensorflowModel(path, spark, "_albert_classification_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_albert_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + + instance.getEngine match { + case TensorFlow.name => + val tf = + readTensorflowModel(path, spark, "_albert_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tf), None, spp) + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "_albert_classification_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) 
+ case _ => + throw new Exception(notSupportedEngineError) + } + } addReader(readModel) @@ -344,7 +383,7 @@ trait ReadAlbertForTokenDLModel extends ReadTensorflowModel with ReadSentencePie detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -357,8 +396,12 @@ trait ReadAlbertForTokenDLModel extends ReadTensorflowModel with ReadSentencePie */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) } From a802ab8752cd16f8f2953431639ffd80f37d4abd Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:35:24 -0500 Subject: [PATCH 04/12] SPARKNLP-884 Enabling getVectors method to get word vectors as spark dataframe (#13957) --- .../sparknlp/annotator/embeddings/doc2vec.py | 6 ++ .../sparknlp/annotator/embeddings/word2vec.py | 6 ++ .../nlp/embeddings/Doc2VecModel.scala | 27 ++++++++- .../nlp/embeddings/Word2VecModel.scala | 23 +++++++- .../nlp/embeddings/Doc2VecTestSpec.scala | 59 ++++++++++++------- .../nlp/embeddings/Word2VecTestSpec.scala | 33 +++++++++++ 6 files changed, 130 insertions(+), 24 deletions(-) diff --git a/python/sparknlp/annotator/embeddings/doc2vec.py b/python/sparknlp/annotator/embeddings/doc2vec.py index da63d575996d1e..1bc6c7120b8e77 100755 --- a/python/sparknlp/annotator/embeddings/doc2vec.py +++ b/python/sparknlp/annotator/embeddings/doc2vec.py @@ -344,3 +344,9 @@ def pretrained(name="doc2vec_gigaword_300", lang="en", remote_loc=None): from sparknlp.pretrained import ResourceDownloader return 
ResourceDownloader.downloadModel(Doc2VecModel, name, lang, remote_loc) + def getVectors(self): + """ + Returns the vector representation of the words as a dataframe + with two fields, word and vector. + """ + return self._call_java("getVectors") diff --git a/python/sparknlp/annotator/embeddings/word2vec.py b/python/sparknlp/annotator/embeddings/word2vec.py index e3e52f32f00c2a..c9c9450f5ffb4e 100755 --- a/python/sparknlp/annotator/embeddings/word2vec.py +++ b/python/sparknlp/annotator/embeddings/word2vec.py @@ -345,3 +345,9 @@ def pretrained(name="word2vec_gigaword_300", lang="en", remote_loc=None): from sparknlp.pretrained import ResourceDownloader return ResourceDownloader.downloadModel(Word2VecModel, name, lang, remote_loc) + def getVectors(self): + """ + Returns the vector representation of the words as a dataframe + with two fields, word and vector. + """ + return self._call_java("getVectors") diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecModel.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecModel.scala index 6b2d6f86664a50..6524c9a3bd1e0e 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecModel.scala @@ -23,7 +23,8 @@ import com.johnsnowlabs.nlp._ import com.johnsnowlabs.storage.HasStorageRef import org.apache.spark.ml.param.{IntParam, ParamValidators} import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.types.{ArrayType, FloatType, StringType, StructField, StructType} /** Word2Vec model that creates vector representations of words in a text corpus. 
* @@ -166,6 +167,21 @@ class Doc2VecModel(override val uid: String) /** @group setParam */ def setWordVectors(value: Map[String, Array[Float]]): this.type = set(wordVectors, value) + private var sparkSession: Option[SparkSession] = None + + def getVectors: DataFrame = { + val vectors: Map[String, Array[Float]] = $$(wordVectors) + val rows = vectors.toSeq.map { case (key, values) => Row(key, values) } + val schema = StructType( + StructField("word", StringType, nullable = false) :: + StructField("vector", ArrayType(FloatType), nullable = false) :: Nil) + if (sparkSession.isEmpty) { + throw new UnsupportedOperationException( + "Vector representation empty. Please run Doc2VecModel in some pipeline before accessing vector vocabulary.") + } + sparkSession.get.createDataFrame(sparkSession.get.sparkContext.parallelize(rows), schema) + } + setDefault(inputCols -> Array(TOKEN), outputCol -> "doc2vec", vectorSize -> 100) private def calculateSentenceEmbeddings(matrix: Seq[Array[Float]]): Array[Float] = { @@ -180,6 +196,11 @@ class Doc2VecModel(override val uid: String) res } + override def beforeAnnotate(dataset: Dataset[_]): Dataset[_] = { + sparkSession = Some(dataset.sparkSession) + dataset + } + /** takes a document and annotations and produces new annotations of this annotator's annotation * type * @@ -204,8 +225,8 @@ class Doc2VecModel(override val uid: String) .filter(_.nonEmpty) val oovVector = Array.fill($(vectorSize))(0.0f) - val vectors = tokens.map { tokne => - $$(wordVectors).getOrElse(tokne, oovVector) + val vectors = tokens.map { token => + $$(wordVectors).getOrElse(token, oovVector) } val sentEmbeddings = calculateSentenceEmbeddings(vectors) diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/Word2VecModel.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/Word2VecModel.scala index 5ddf760450df4d..67a7388eef419f 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/embeddings/Word2VecModel.scala +++ 
b/src/main/scala/com/johnsnowlabs/nlp/embeddings/Word2VecModel.scala @@ -24,7 +24,8 @@ import com.johnsnowlabs.nlp._ import com.johnsnowlabs.storage.HasStorageRef import org.apache.spark.ml.param.{IntParam, ParamValidators} import org.apache.spark.ml.util.Identifiable -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.{ArrayType, FloatType, StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} /** Word2Vec model that creates vector representations of words in a text corpus. * @@ -167,8 +168,28 @@ class Word2VecModel(override val uid: String) /** @group setParam */ def setWordVectors(value: Map[String, Array[Float]]): this.type = set(wordVectors, value) + private var sparkSession: Option[SparkSession] = None + + def getVectors: DataFrame = { + val vectors: Map[String, Array[Float]] = $$(wordVectors) + val rows = vectors.toSeq.map { case (key, values) => Row(key, values) } + val schema = StructType( + StructField("word", StringType, nullable = false) :: + StructField("vector", ArrayType(FloatType), nullable = false) :: Nil) + if (sparkSession.isEmpty) { + throw new UnsupportedOperationException( + "Vector representation empty. 
Please run Word2VecModel in some pipeline before accessing vector vocabulary.") + } + sparkSession.get.createDataFrame(sparkSession.get.sparkContext.parallelize(rows), schema) + } + setDefault(inputCols -> Array(TOKEN), outputCol -> "word2vec", vectorSize -> 100) + override def beforeAnnotate(dataset: Dataset[_]): Dataset[_] = { + sparkSession = Some(dataset.sparkSession) + dataset + } + /** takes a document and annotations and produces new annotations of this annotator's annotation * type * diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecTestSpec.scala index c66ddb882b4216..d9dc979116e7ce 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/Doc2VecTestSpec.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.nlp.annotator._ +import com.johnsnowlabs.nlp.annotators.SparkSessionTest import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.training.CoNLL import com.johnsnowlabs.nlp.util.io.ResourceHelper @@ -27,7 +28,7 @@ import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, Multiclas import org.apache.spark.sql.functions.{explode, when} import org.scalatest.flatspec.AnyFlatSpec -class Doc2VecTestSpec extends AnyFlatSpec { +class Doc2VecTestSpec extends AnyFlatSpec with SparkSessionTest { "Doc2VecApproach" should "train, save, and load back the saved model" taggedAs FastTest in { @@ -43,18 +44,6 @@ class Doc2VecTestSpec extends AnyFlatSpec { " ", " ").toDF("text") - val document = new DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") - - val setence = new SentenceDetector() - .setInputCols("document") - .setOutputCol("sentence") - - val tokenizer = new Tokenizer() - .setInputCols(Array("sentence")) - .setOutputCol("token") - val stops = new StopWordsCleaner() .setInputCols("token") .setOutputCol("cleanedToken") @@ -67,7 
+56,7 @@ class Doc2VecTestSpec extends AnyFlatSpec { .setStorageRef("my_awesome_doc2vec") .setEnableCaching(true) - val pipeline = new Pipeline().setStages(Array(document, setence, tokenizer, stops, doc2Vec)) + val pipeline = new Pipeline().setStages(Array(documentAssembler, sentenceDetector, tokenizerWithSentence, stops, doc2Vec)) val pipelineModel = pipeline.fit(ddd) val pipelineDF = pipelineModel.transform(ddd) @@ -87,7 +76,7 @@ class Doc2VecTestSpec extends AnyFlatSpec { .setOutputCol("sentence_embeddings") val loadedPipeline = - new Pipeline().setStages(Array(document, setence, tokenizer, loadedDoc2Vec)) + new Pipeline().setStages(Array(documentAssembler, sentenceDetector, tokenizerWithSentence, loadedDoc2Vec)) loadedPipeline.fit(ddd).transform(ddd).select("sentence_embeddings").show() @@ -105,10 +94,6 @@ class Doc2VecTestSpec extends AnyFlatSpec { "carbon emissions have come down without impinging on our growth .\\u2009.\\u2009.", "the ").toDF("text") - val document = new DocumentAssembler() - .setInputCol("text") - .setOutputCol("document") - val setence = new SentenceDetector() .setInputCols("document") .setOutputCol("sentence") @@ -135,7 +120,7 @@ class Doc2VecTestSpec extends AnyFlatSpec { val pipeline = new Pipeline().setStages( Array( - document, + documentAssembler, setence, tokenizerDocument, tokenizerSentence, @@ -332,4 +317,38 @@ class Doc2VecTestSpec extends AnyFlatSpec { println("Area under ROC = " + auROC) } + + it should "get word vectors as spark dataframe" taggedAs SlowTest in { + + import ResourceHelper.spark.implicits._ + + val testDataset = Seq( + "Rare Hendrix song draft sells for almost $17,000. This is my second sentenece! 
The third one here!") + .toDF("text") + + val doc2Vec = Doc2VecModel + .pretrained() + .setInputCols("token") + .setOutputCol("embeddings") + + val pipeline = + new Pipeline().setStages(Array(documentAssembler, tokenizer, doc2Vec)) + + val result = pipeline.fit(testDataset).transform(testDataset) + result.show() + + doc2Vec.getVectors.show() + } + + it should "raise an error when trying to retrieve empty word vectors" taggedAs SlowTest in { + val word2Vec = Doc2VecModel + .pretrained() + .setInputCols("token") + .setOutputCol("embeddings") + + intercept[UnsupportedOperationException] { + word2Vec.getVectors + } + } + } diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/Word2VecTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/Word2VecTestSpec.scala index 8f51f440cd29d8..3c27a767b1da88 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/Word2VecTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/Word2VecTestSpec.scala @@ -201,4 +201,37 @@ class Word2VecTestSpec extends AnyFlatSpec with SparkSessionTest { } + it should "get word vectors as spark dataframe" taggedAs SlowTest in { + + import ResourceHelper.spark.implicits._ + + val testDataset = Seq( + "Rare Hendrix song draft sells for almost $17,000. This is my second sentenece! 
The third one here!") + .toDF("text") + + val word2Vec = Word2VecModel + .pretrained() + .setInputCols("token") + .setOutputCol("embeddings") + + val pipeline = + new Pipeline().setStages(Array(documentAssembler, tokenizer, word2Vec)) + + val result = pipeline.fit(testDataset).transform(testDataset) + result.show() + + word2Vec.getVectors.show() + } + + it should "raise an error when trying to retrieve empty word vectors" taggedAs SlowTest in { + val word2Vec = Word2VecModel + .pretrained() + .setInputCols("token") + .setOutputCol("embeddings") + + intercept[UnsupportedOperationException] { + word2Vec.getVectors + } + } + } From 637f007c37833f9e648b8a13193836bf8d97808c Mon Sep 17 00:00:00 2001 From: Devin Ha <33089471+DevinTDHa@users.noreply.github.com> Date: Thu, 7 Sep 2023 18:36:09 +0200 Subject: [PATCH 05/12] [SPARKNLP-890] ONNX E5 MPnet example (#13958) --- .../HuggingFace_ONNX_in_Spark_NLP_E5.ipynb | 388 +++ .../HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb | 2533 +++++++++++++++++ 2 files changed, 2921 insertions(+) create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb new file mode 100644 index 00000000000000..a0f8755c0a48f2 --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb @@ -0,0 +1,388 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb)\n", + "\n", + "# Import ONNX E5 models from HuggingFace 🤗 into 
Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support for this annotator was introduced in `Spark NLP 5.1.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for E5 from HuggingFace and they have to be in the `Sentence Similarity` category. Meaning, you cannot use E5 models trained/fine-tuned on a specific task such as token/sequence classification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and its dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.6/380.6 kB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m40.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m37.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m54.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.3/519.3 kB\u001b[0m \u001b[31m39.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m59.6 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m46.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m37.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m16.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Framework not specified. 
Using pt to export to ONNX.\n", + "Using framework PyTorch: 2.0.1+cu118\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n", + "verbose: False, log level: Level.ERROR\n", + "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n", + "\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForFeatureExtraction\n", + "\n", + "MODEL_NAME = \"intfloat/e5-small-v2\"\n", + "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets\n", + "!mv {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 130692\n", + "drwxr-xr-x 2 root root 4096 Sep 5 09:03 assets\n", + "-rw-r--r-- 1 root root 626 Sep 5 09:03 config.json\n", + "-rw-r--r-- 1 root root 133093467 Sep 5 09:03 model.onnx\n", + "-rw-r--r-- 1 root root 125 Sep 5 09:03 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 314 Sep 5 09:03 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 711396 Sep 5 09:03 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 228\n", + "-rw-r--r-- 1 
root root 231508 Sep 5 09:03 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save E5 in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m531.2/531.2 kB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use `loadSavedModel` functon in `E5Embeddings` which allows us to load the ONNX model\n", + "- Most params will be set automatically. 
They can also be set later after loading the model in `E5Embeddings` during runtime, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "E5 = E5Embeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"E5\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "E5.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX E5 model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", +
"execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 130008\n", + "-rw-r--r-- 1 root root 133113905 Sep 5 08:57 e5_onnx\n", + "drwxr-xr-x 3 root root 4096 Sep 5 08:57 fields\n", + "drwxr-xr-x 2 root root 4096 Sep 5 08:57 metadata\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny E5 model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "E5_loaded = E5Embeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n", + " .setInputCols([\"document\"])\\\n", + " .setOutputCol(\"E5\")\\\n", + "\n", + "pipeline = Pipeline(\n", + " stages = [\n", + " document_assembler,\n", + " E5_loaded\n", + " ])\n", + "\n", + "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n", + "model = pipeline.fit(data)\n", + "result = model.transform(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n", + "| embeddings|\n", + "+--------------------+\n", + "|[-0.35357836, 0.3...|\n", + "+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "result.selectExpr(\"explode(E5.embeddings) as embeddings\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! 
You can now go wild and use hundreds of E5 models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb new file mode 100644 index 00000000000000..ee8377171143b6 --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb @@ -0,0 +1,2533 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_MPNet.ipynb)\n", + "\n", + "# Import ONNX MPNet models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support for this annotator was introduced in `Spark NLP 5.1.1`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for MPNet from HuggingFace and they have to be in `Sentence Similarity` category. Meaning, you cannot use MPNet models trained/fine-tuned on a specific task such as token/sequence classification." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and its dependencies. 
You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m19.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.6/380.6 kB\u001b[0m \u001b[31m33.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m27.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m54.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m44.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m79.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.3/519.3 kB\u001b[0m \u001b[31m44.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m86.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m73.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m71.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m25.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m18.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "aaa606f9a3ff42a79f352eab50bafe2e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading (…)lve/main/config.json: 0%| | 0.00/571 [00:00 Date: Thu, 7 Sep 2023 18:47:30 +0200 Subject: [PATCH 06/12] Bump version to 5.1.1 --- README.md | 88 +++++++++---------- build.sbt | 2 +- docs/_layouts/landing.html | 2 +- docs/en/concepts.md | 2 +- docs/en/examples.md | 4 +- docs/en/hardware_acceleration.md | 2 +- docs/en/install.md | 54 ++++++------ docs/en/spark_nlp.md | 2 +- python/README.md | 88 +++++++++---------- python/docs/conf.py | 2 +- python/setup.py | 2 +- python/sparknlp/__init__.py | 4 +- scripts/colab_setup.sh | 2 +- scripts/kaggle_setup.sh | 2 +- scripts/sagemaker_setup.sh | 2 +- .../scala/com/johnsnowlabs/nlp/SparkNLP.scala | 2 +- .../scala/com/johnsnowlabs/util/Build.scala | 2 +- 17 
files changed, 131 insertions(+), 131 deletions(-) diff --git a/README.md b/README.md index 1e72ead60259fa..20a3e3a1fe8b8b 100644 --- a/README.md +++ b/README.md @@ -170,7 +170,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.1.0 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: +Spark NLP 5.1.1 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -186,7 +186,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.1.0 pyspark==3.3.1 +$ pip install spark-nlp==5.1.1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -231,7 +231,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.1.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.1.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ -270,7 +270,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.1.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.1.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -331,7 +331,7 @@ Spark NLP 5.1.0 has been 
tested and is compatible with the following runtimes: ## EMR Support -Spark NLP 5.1.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.1.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -376,11 +376,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` The `spark-nlp` has been published to @@ -389,11 +389,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.1 ``` @@ -403,11 +403,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.1 ``` @@ -417,11 +417,11 @@ the [Maven 
Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.1 ``` @@ -435,7 +435,7 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` ## Scala @@ -453,7 +453,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.1.0 + 5.1.1 ``` @@ -464,7 +464,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.1.0 + 5.1.1 ``` @@ -475,7 +475,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.1.0 + 5.1.1 ``` @@ -486,7 +486,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.1.0 + 5.1.1 ``` @@ -496,28 +496,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.1.1" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.1.1" ``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.1.1" ``` **spark-nlp-silicon:** 
```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.1.1" ``` Maven @@ -539,7 +539,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==5.1.0 +pip install spark-nlp==5.1.1 ``` Conda: @@ -568,7 +568,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1") .getOrCreate() ``` @@ -639,7 +639,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -650,7 +650,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.1.0 +pip install spark-nlp==5.1.1 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -678,7 +678,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.1.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.1.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -695,7 +695,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` 
Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -722,7 +722,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -745,7 +745,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.1 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -764,9 +764,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.1.0` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.1.1` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! 
@@ -817,7 +817,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1" } }] ``` @@ -826,7 +826,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.1.0" \ +--name "Spark NLP 5.1.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -890,7 +890,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -929,7 +929,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1") .getOrCreate() ``` @@ -943,7 +943,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` **pyspark:** @@ -956,7 +956,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` **Databricks:** @@ -1228,7 +1228,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.1.0.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.1.1.jar") .getOrCreate() ``` @@ -1237,7 +1237,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.1.0.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.1.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/build.sbt b/build.sbt index 2cf6ab5b70cc61..72ce50a828d091 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "5.1.0" +version := "5.1.1" (ThisBuild / scalaVersion) := scalaVer diff --git a/docs/_layouts/landing.html b/docs/_layouts/landing.html index d2d9df22a02743..e972df028620ce 100755 --- a/docs/_layouts/landing.html +++ b/docs/_layouts/landing.html @@ -201,7 +201,7 @@

{{ _section.title }}

{% highlight bash %} # Using PyPI - $ pip install spark-nlp==5.1.0 + $ pip install spark-nlp==5.1.1 # Using Anaconda/Conda $ conda install -c johnsnowlabs spark-nlp diff --git a/docs/en/concepts.md b/docs/en/concepts.md index 1807f7c44a4d30..ab4ec9de6f7633 100644 --- a/docs/en/concepts.md +++ b/docs/en/concepts.md @@ -62,7 +62,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.1.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.1.1 pyspark==3.3.1 jupyter $ jupyter notebook ``` diff --git a/docs/en/examples.md b/docs/en/examples.md index da0c6b1f20e8cc..66ffd223fd17d8 100644 --- a/docs/en/examples.md +++ b/docs/en/examples.md @@ -16,7 +16,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp -$ pip install spark-nlp==5.1.0 pyspark==3.3.1 +$ pip install spark-nlp==5.1.1 pyspark==3.3.1 ``` ## Google Colab Notebook @@ -36,7 +36,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!bash colab.sh -p 3.2.3 -s 5.1.0 +!bash colab.sh -p 3.2.3 -s 5.1.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. 
diff --git a/docs/en/hardware_acceleration.md b/docs/en/hardware_acceleration.md index de6a460fd83770..59cca25b299619 100644 --- a/docs/en/hardware_acceleration.md +++ b/docs/en/hardware_acceleration.md @@ -49,7 +49,7 @@ Since the new Transformer models such as BERT for Word and Sentence embeddings a | DeBERTa Large | +477%(5.8x) | | Longformer Base | +52%(1.5x) | -Spark NLP 5.1.0 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.1.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/docs/en/install.md b/docs/en/install.md index 4c760d5c898a6d..07cf8b54646486 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -15,22 +15,22 @@ sidebar: ```bash # Install Spark NLP from PyPI -pip install spark-nlp==5.1.0 +pip install spark-nlp==5.1.1 # Install Spark NLP from Anacodna/Conda conda install -c johnsnowlabs spark-nlp # Load Spark NLP with Spark Shell -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 # Load Spark NLP with PySpark -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 # Load Spark NLP with Spark Submit -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 # Load Spark NLP as external JAR after compiling and building Spark NLP by `sbt assembly` -spark-shell --jars spark-nlp-assembly-5.1.0.jar +spark-shell --jars spark-nlp-assembly-5.1.1.jar ``` ## Python @@ -49,7 +49,7 @@ $ java -version # should be Java 8 (Oracle or OpenJDK) $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp -$ pip install spark-nlp==5.1.0 pyspark==3.3.1 +$ pip install spark-nlp==5.1.1 pyspark==3.3.1 ``` Of course you will need to have jupyter 
installed in your system: @@ -76,7 +76,7 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0")\ + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1")\ .getOrCreate() ``` @@ -91,7 +91,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp_2.12 - 5.1.0 + 5.1.1 ``` @@ -102,7 +102,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.1.0 + 5.1.1 ``` @@ -113,7 +113,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.1.0 + 5.1.1 ``` @@ -124,7 +124,7 @@ spark = SparkSession.builder \ com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.1.0 + 5.1.1 ``` @@ -134,28 +134,28 @@ spark = SparkSession.builder \ ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.1.1" ``` **spark-nlp-gpu:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.1.1" ``` **spark-nlp-silicon:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.1.1" ``` **spark-nlp-aarch64:** ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.1.1" ``` Maven Central: 
[https://mvnrepository.com/artifact/com.johnsnowlabs.nlp](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp) @@ -233,7 +233,7 @@ maven coordinates like these: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.1.0 + 5.1.1 ``` @@ -241,7 +241,7 @@ or in case of sbt: ```scala // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.1.1" ``` If everything went well, you can now start Spark NLP with the `m1` flag set to `true`: @@ -274,7 +274,7 @@ spark = sparknlp.start(apple_silicon=True) ## Installation for Linux Aarch64 Systems -Starting from version 5.1.0, Spark NLP supports Linux systems running on an aarch64 +Starting from version 5.1.1, Spark NLP supports Linux systems running on an aarch64 processor architecture. The necessary dependencies have been built on Ubuntu 16.04, so a recent system with an environment of at least that will be needed. @@ -318,7 +318,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -p is for pyspark # -s is for spark-nlp # by default they are set to the latest -!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.0 +!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) is a live demo on Google Colab that performs named entity recognitions and sentiment analysis by using Spark NLP pretrained pipelines. @@ -337,7 +337,7 @@ Run the following code in Kaggle Kernel and start using spark-nlp right away. 
## Databricks Support -Spark NLP 5.1.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.1.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -412,7 +412,7 @@ Spark NLP 5.1.0 has been tested and is compatible with the following runtimes: 3.1. Install New -> PyPI -> `spark-nlp` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0` -> Install + 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -428,7 +428,7 @@ Note: You can import these notebooks by using their URLs. ## EMR Support -Spark NLP 5.1.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.1.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -486,7 +486,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1" } } ] @@ -496,7 +496,7 @@ A sample of AWS CLI to launch EMR cluster: ```sh aws emr create-cluster \ ---name "Spark NLP 5.1.0" \ +--name "Spark NLP 5.1.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -750,7 +750,7 @@ We recommend using `conda` to manage your Python environment on Windows. Now you can use the downloaded binary by navigating to `%SPARK_HOME%\bin` and running -Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0*. 
+Either create a conda env for python 3.6, install *pyspark==3.3.1 spark-nlp numpy* and use Jupyter/python console, or in the same conda env you can go to spark bin for *pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1*. @@ -776,12 +776,12 @@ spark = SparkSession.builder \ .config("spark.driver.memory","16G")\ .config("spark.driver.maxResultSize", "0") \ .config("spark.kryoserializer.buffer.max", "2000M")\ - .config("spark.jars", "/tmp/spark-nlp-assembly-5.1.0.jar")\ + .config("spark.jars", "/tmp/spark-nlp-assembly-5.1.1.jar")\ .getOrCreate() ``` - You can download provided Fat JARs from each [release notes](https://github.com/JohnSnowLabs/spark-nlp/releases), please pay attention to pick the one that suits your environment depending on the device (CPU/GPU) and Apache Spark version (3.x) -- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.1.0.jar`) +- If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. (i.e., `hdfs:///tmp/spark-nlp-assembly-5.1.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/docs/en/spark_nlp.md b/docs/en/spark_nlp.md index 89161627fd7d52..fb1e34d4268e46 100644 --- a/docs/en/spark_nlp.md +++ b/docs/en/spark_nlp.md @@ -25,7 +25,7 @@ Spark NLP is built on top of **Apache Spark 3.x**. 
For using Spark NLP you need: **GPU (optional):** -Spark NLP 5.1.0 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: +Spark NLP 5.1.1 is built with TensorFlow 2.7.1 and the following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 diff --git a/python/README.md b/python/README.md index 1e72ead60259fa..20a3e3a1fe8b8b 100644 --- a/python/README.md +++ b/python/README.md @@ -170,7 +170,7 @@ To use Spark NLP you need the following requirements: **GPU (optional):** -Spark NLP 5.1.0 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: +Spark NLP 5.1.1 is built with ONNX 1.15.1 and TensorFlow 2.7.1 deep learning engines. The minimum following NVIDIA® software are only required for GPU support: - NVIDIA® GPU drivers version 450.80.02 or higher - CUDA® Toolkit 11.2 @@ -186,7 +186,7 @@ $ java -version $ conda create -n sparknlp python=3.7 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.1.0 pyspark==3.3.1 +$ pip install spark-nlp==5.1.1 pyspark==3.3.1 ``` In Python console or Jupyter `Python3` kernel: @@ -231,7 +231,7 @@ For more examples, you can visit our dedicated [examples](https://github.com/Joh ## Apache Spark Support -Spark NLP *5.1.0* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x +Spark NLP *5.1.1* has been built on top of Apache Spark 3.4 while fully supports Apache Spark 3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x | Spark NLP | Apache Spark 2.3.x | Apache Spark 2.4.x | Apache Spark 3.0.x | Apache Spark 3.1.x | Apache Spark 3.2.x | Apache Spark 3.3.x | Apache Spark 3.4.x | |-----------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------| @@ 
-270,7 +270,7 @@ Find out more about `Spark NLP` versions from our [release notes](https://github ## Databricks Support -Spark NLP 5.1.0 has been tested and is compatible with the following runtimes: +Spark NLP 5.1.1 has been tested and is compatible with the following runtimes: **CPU:** @@ -331,7 +331,7 @@ Spark NLP 5.1.0 has been tested and is compatible with the following runtimes: ## EMR Support -Spark NLP 5.1.0 has been tested and is compatible with the following EMR releases: +Spark NLP 5.1.1 has been tested and is compatible with the following EMR releases: - emr-6.2.0 - emr-6.3.0 @@ -376,11 +376,11 @@ Spark NLP supports all major releases of Apache Spark 3.0.x, Apache Spark 3.1.x, ```sh # CPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` The `spark-nlp` has been published to @@ -389,11 +389,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # GPU -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:5.1.1 ``` @@ -403,11 +403,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # AArch64 -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.1 -pyspark --packages 
com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-aarch64_2.12:5.1.1 ``` @@ -417,11 +417,11 @@ the [Maven Repository](https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/s ```sh # M1/M2 (Apple Silicon) -spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.0 +spark-shell --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.1 -pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.1 -spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.0 +spark-submit --packages com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:5.1.1 ``` @@ -435,7 +435,7 @@ set in your SparkSession: spark-shell \ --driver-memory 16g \ --conf spark.kryoserializer.buffer.max=2000M \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` ## Scala @@ -453,7 +453,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp_2.12 - 5.1.0 + 5.1.1 ``` @@ -464,7 +464,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-gpu_2.12 - 5.1.0 + 5.1.1 ``` @@ -475,7 +475,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-aarch64_2.12 - 5.1.0 + 5.1.1 ``` @@ -486,7 +486,7 @@ coordinates: com.johnsnowlabs.nlp spark-nlp-silicon_2.12 - 5.1.0 + 5.1.1 ``` @@ -496,28 +496,28 @@ coordinates: ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp" % "5.1.1" ``` **spark-nlp-gpu:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-gpu -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-gpu" % "5.1.1" 
``` **spark-nlp-aarch64:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-aarch64 -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-aarch64" % "5.1.1" ``` **spark-nlp-silicon:** ```sbtshell // https://mvnrepository.com/artifact/com.johnsnowlabs.nlp/spark-nlp-silicon -libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.1.0" +libraryDependencies += "com.johnsnowlabs.nlp" %% "spark-nlp-silicon" % "5.1.1" ``` Maven @@ -539,7 +539,7 @@ If you installed pyspark through pip/conda, you can install `spark-nlp` through Pip: ```bash -pip install spark-nlp==5.1.0 +pip install spark-nlp==5.1.1 ``` Conda: @@ -568,7 +568,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1") .getOrCreate() ``` @@ -639,7 +639,7 @@ Use either one of the following options - Add the following Maven Coordinates to the interpreter's library list ```bash -com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` - Add a path to pre-built jar from [here](#compiled-jars) in the interpreter's library list making sure the jar is @@ -650,7 +650,7 @@ com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 Apart from the previous step, install the python module through pip ```bash -pip install spark-nlp==5.1.0 +pip install spark-nlp==5.1.1 ``` Or you can install `spark-nlp` from inside Zeppelin by using Conda: @@ -678,7 +678,7 @@ launch the Jupyter from the same Python environment: $ conda create -n sparknlp python=3.8 -y $ conda activate sparknlp # spark-nlp by default is based on pyspark 3.x -$ pip install spark-nlp==5.1.0 pyspark==3.3.1 jupyter +$ pip install spark-nlp==5.1.1 
pyspark==3.3.1 jupyter $ jupyter notebook ``` @@ -695,7 +695,7 @@ export PYSPARK_PYTHON=python3 export PYSPARK_DRIVER_PYTHON=jupyter export PYSPARK_DRIVER_PYTHON_OPTS=notebook -pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 +pyspark --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` Alternatively, you can mix in using `--jars` option for pyspark + `pip install spark-nlp` @@ -722,7 +722,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Google Colab for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.1 ``` [Spark NLP quick start on Google Colab](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/quick_start_google_colab.ipynb) @@ -745,7 +745,7 @@ This script comes with the two options to define `pyspark` and `spark-nlp` versi # -s is for spark-nlp # -g will enable upgrading libcudnn8 to 8.1.0 on Kaggle for GPU usage # by default they are set to the latest -!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.0 +!wget https://setup.johnsnowlabs.com/colab.sh -O - | bash /dev/stdin -p 3.2.3 -s 5.1.1 ``` [Spark NLP quick start on Kaggle Kernel](https://www.kaggle.com/mozzie/spark-nlp-named-entity-recognition) is a live @@ -764,9 +764,9 @@ demo on Kaggle Kernel that performs named entity recognitions by using Spark NLP 3. In `Libraries` tab inside your cluster you need to follow these steps: - 3.1. Install New -> PyPI -> `spark-nlp==5.1.0` -> Install + 3.1. Install New -> PyPI -> `spark-nlp==5.1.1` -> Install - 3.2. Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0` -> Install + 3.2. 
Install New -> Maven -> Coordinates -> `com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1` -> Install 4. Now you can attach your notebook to the cluster and use Spark NLP! @@ -817,7 +817,7 @@ A sample of your software configuration in JSON on S3 (must be public access): "spark.kryoserializer.buffer.max": "2000M", "spark.serializer": "org.apache.spark.serializer.KryoSerializer", "spark.driver.maxResultSize": "0", - "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0" + "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1" } }] ``` @@ -826,7 +826,7 @@ A sample of AWS CLI to launch EMR cluster: ```.sh aws emr create-cluster \ ---name "Spark NLP 5.1.0" \ +--name "Spark NLP 5.1.1" \ --release-label emr-6.2.0 \ --applications Name=Hadoop Name=Spark Name=Hive \ --instance-type m4.4xlarge \ @@ -890,7 +890,7 @@ gcloud dataproc clusters create ${CLUSTER_NAME} \ --enable-component-gateway \ --metadata 'PIP_PACKAGES=spark-nlp spark-nlp-display google-cloud-bigquery google-cloud-storage' \ --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/python/pip-install.sh \ - --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 + --properties spark:spark.serializer=org.apache.spark.serializer.KryoSerializer,spark:spark.driver.maxResultSize=0,spark:spark.kryoserializer.buffer.max=2000M,spark:spark.jars.packages=com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` 2. On an existing one, you need to install spark-nlp and spark-nlp-display packages from PyPI. 
@@ -929,7 +929,7 @@ spark = SparkSession.builder .config("spark.kryoserializer.buffer.max", "2000m") .config("spark.jsl.settings.pretrained.cache_folder", "sample_data/pretrained") .config("spark.jsl.settings.storage.cluster_tmp_dir", "sample_data/storage") - .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0") + .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1") .getOrCreate() ``` @@ -943,7 +943,7 @@ spark-shell \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` **pyspark:** @@ -956,7 +956,7 @@ pyspark \ --conf spark.kryoserializer.buffer.max=2000M \ --conf spark.jsl.settings.pretrained.cache_folder="sample_data/pretrained" \ --conf spark.jsl.settings.storage.cluster_tmp_dir="sample_data/storage" \ - --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.0 + --packages com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.1 ``` **Databricks:** @@ -1228,7 +1228,7 @@ spark = SparkSession.builder .config("spark.driver.memory", "16G") .config("spark.driver.maxResultSize", "0") .config("spark.kryoserializer.buffer.max", "2000M") - .config("spark.jars", "/tmp/spark-nlp-assembly-5.1.0.jar") + .config("spark.jars", "/tmp/spark-nlp-assembly-5.1.1.jar") .getOrCreate() ``` @@ -1237,7 +1237,7 @@ spark = SparkSession.builder version (3.0.x, 3.1.x, 3.2.x, 3.3.x, and 3.4.x) - If you are local, you can load the Fat JAR from your local FileSystem, however, if you are in a cluster setup you need to put the Fat JAR on a distributed FileSystem such as HDFS, DBFS, S3, etc. 
( - i.e., `hdfs:///tmp/spark-nlp-assembly-5.1.0.jar`) + i.e., `hdfs:///tmp/spark-nlp-assembly-5.1.1.jar`) Example of using pretrained Models and Pipelines in offline: diff --git a/python/docs/conf.py b/python/docs/conf.py index b1eee1184d1664..1d6a25efa148c1 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -23,7 +23,7 @@ author = "John Snow Labs" # The full version, including alpha/beta/rc tags -release = "5.1.0" +release = "5.1.1" pyspark_version = "3.2.3" # -- General configuration --------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index 5fd1d65cd48116..67f0211073ad7a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='5.1.0', # Required + version='5.1.1', # Required # This is a one-line description or tagline of what your project does. This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/__init__.py b/python/sparknlp/__init__.py index 058e5ec39e7cfc..17a36987ffc750 100644 --- a/python/sparknlp/__init__.py +++ b/python/sparknlp/__init__.py @@ -128,7 +128,7 @@ def start(gpu=False, The initiated Spark session. """ - current_version = "5.1.0" + current_version = "5.1.1" if params is None: params = {} @@ -309,4 +309,4 @@ def version(): str The current Spark NLP version. 
""" - return '5.1.0' + return '5.1.1' diff --git a/scripts/colab_setup.sh b/scripts/colab_setup.sh index a3d52f1fed4521..57060d7238ebdc 100644 --- a/scripts/colab_setup.sh +++ b/scripts/colab_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.1.0" +SPARKNLP="5.1.1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/kaggle_setup.sh b/scripts/kaggle_setup.sh index dd58f06f506d29..d6186155e58b04 100644 --- a/scripts/kaggle_setup.sh +++ b/scripts/kaggle_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash #default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.1.0" +SPARKNLP="5.1.1" PYSPARK="3.2.3" while getopts s:p:g option diff --git a/scripts/sagemaker_setup.sh b/scripts/sagemaker_setup.sh index 5ed18c7b2b53de..556f92d998db10 100644 --- a/scripts/sagemaker_setup.sh +++ b/scripts/sagemaker_setup.sh @@ -1,7 +1,7 @@ #!/bin/bash # Default values for pyspark, spark-nlp, and SPARK_HOME -SPARKNLP="5.1.0" +SPARKNLP="5.1.1" PYSPARK="3.2.3" echo "Setup SageMaker for PySpark $PYSPARK and Spark NLP $SPARKNLP" diff --git a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala index 76331e4847109d..e810341c598cad 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/SparkNLP.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.SparkSession object SparkNLP { - val currentVersion = "5.1.0" + val currentVersion = "5.1.1" val MavenSpark3 = s"com.johnsnowlabs.nlp:spark-nlp_2.12:$currentVersion" val MavenGpuSpark3 = s"com.johnsnowlabs.nlp:spark-nlp-gpu_2.12:$currentVersion" val MavenSparkSilicon = s"com.johnsnowlabs.nlp:spark-nlp-silicon_2.12:$currentVersion" diff --git a/src/main/scala/com/johnsnowlabs/util/Build.scala b/src/main/scala/com/johnsnowlabs/util/Build.scala index 0f37c68b7e2f99..e7c155a2038d0f 100644 --- a/src/main/scala/com/johnsnowlabs/util/Build.scala +++ b/src/main/scala/com/johnsnowlabs/util/Build.scala @@ 
-17,5 +17,5 @@ package com.johnsnowlabs.util object Build { - val version: String = "5.1.0" + val version: String = "5.1.1" } From 2d2edd0969534a2bedffc60b2da0b4fe65d1dbab Mon Sep 17 00:00:00 2001 From: Danilo Burbano Date: Thu, 7 Sep 2023 14:31:12 -0500 Subject: [PATCH 07/12] [SPARKNLP-891] [SPARKNLP-892] [SPARKNLP-893] Adding docs for ONNX support in AlbertXXX --- ...Spark_NLP_AlbertForQuestionAnswering.ipynb | 2389 ++++++++++++++ ..._NLP_AlbertForSequenceClassification.ipynb | 2492 ++++++++++++++ ...ark_NLP_AlbertForTokenClassification.ipynb | 2863 +++++++++++++++++ 3 files changed, 7744 insertions(+) create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForQuestionAnswering.ipynb create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForSequenceClassification.ipynb create mode 100644 examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForTokenClassification.ipynb diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForQuestionAnswering.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForQuestionAnswering.ipynb new file mode 100644 index 00000000000000..e8c1a45a0c5ab2 --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForQuestionAnswering.ipynb @@ -0,0 +1,2389 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "vfU3Ee88cwGj" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace%20ONNX%20in%20Spark%20NLP%20-%20AlbertForQuestionAnswering.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fM_4ix0mcwGm" + }, + "source": [ + "## Import ONNX AlbertForQuestionAnswering models from HuggingFace 🤗 into Spark 
NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `AlbertForQuestionAnswering` is only available in `Spark NLP 5.1.1` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import ALBERT models trained/fine-tuned for question answering via `AlbertForQuestionAnswering`. These models are usually under `Question Answering` category and have `albert` in their labels\n", + "- Reference: [TFAlbertForQuestionAnswering](https://huggingface.co/transformers/model_doc/albert#transformers.TFAlbertForQuestionAnswering)\n", + "- Some [example models](https://huggingface.co/models?filter=albert&pipeline_tag=question-answering)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EVzmVKX8cwGn" + }, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WDSalCHsd9-z" + }, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and its dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. 
This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- Albert uses SentencePiece, so we will have to install that as well" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qSx09sNyegma", + "outputId": "856fec44-5971-4f8c-92f1-32fbdb42c835" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m42.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.6/380.6 kB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m46.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m56.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m29.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m90.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.3/519.3 kB\u001b[0m \u001b[31m40.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m80.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m33.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m10.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uFkFe1YUewJR" + }, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [twmkn9/albert-base-v2-squad2](https://huggingface.co/twmkn9/albert-base-v2-squad2) model from HuggingFace as an example and load it as a `ORTModelForQuestionAnswering`, representing an ONNX model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 281, + "referenced_widgets": [ + "ec15c2e6e2304fef92fa31fff51410af", + "ac1ddfa7ac524c2e86e21dfeebcaf4a0", + "04537a8e9d2c4752a8d801b798be69d9", + "c27ca8c29e45420aa22fec6ffc6c5d3b", + "9dbbd6b8df4b42b4858e44b39a5626a6", + "7b8f0cf0733d4672a7c7ab40cb7c3762", + "e443ec6bf38646e7895da78572ee014b", + "64754bc22132400f8f56cc2ad003329f", + "351363fdfe564e6d8c8ce9cde81f9a5b", + "ef2951ba136e42d2a69647e90ed47ce4", + "99d286f7c90c4112bcb91d990da0ce7f", + "a99c78e01d8a4742bba2e5b680f2c52d", + "5a099d52161644e682fba20573b9b623", + "d7192c1830744812b1e4a8b5c8bb31bc", + "c00ecf10983f41138d600eeed4edef4d", + "bc4c86e54f3346aaad7dc901adabf094", + "99535787eb9441b59a235f5dba61af54", + "0cb3ea3c1b4e43a582384185a09ae683", + "8c3a2943eb034f479294e2e8456da476", + "c3d8bef714264ccebc4ad59915c1261d", + "00135622144048da808a3d1e4791f591", + "c3e3ab0d58d740c2aeaf5de9de9017a4", + "7140fb718b9e4c4dbb6c388f56afc466", + "2e4e73b5633742d2bfb416cd5d5748a7", + "1105b2cdf1d347a29076bc1983c3a72d", + "68f386da4346462a8b176978056bcd5d", + "d627e9e07528463888a4f5dd115bcffe", + "ec62496e3a184dc5a3456f7aefbe7a63", + "07bfd428a7d04de193f46a957a25ff93", + "16e1d818cd0b4cbfb267fd6065f251f0", + "f5a0ff573cfa486795390aed31a60cc2", + "29963c735d984897b5244b3a6b634c73", + "60d80c81004b4bd3a48e1d5ec3b334f4", + "4dd41fcfb41942d9be4cf4cb9cbadf96", + "03c74fe335254a79b36f0dcee68d9197", + "ddc2b489d69b41269807a90e2cdbfa2c", + "afaace411a834a54a2bc1271a98b09c0", + "283e780520f94dfd91aa98c009f8d1e9", + "ad330fc387c343c885035ea455d7975d", + "d141589d19f54acea565a58ba2a5a53f", + "43272e26131347499b4fbb41427dbc1d", + "067bada14f42406ea9e604da7e924a74", + "b13112736ed54aa29e36d974532f2e45", + "7cb1923da4e44d7789511489264850f9", + "29265726de524880ad2f214a1162979b", + "c2a4cb32791548fcb3387435cb97f496", + "300aa1ee709847668688edf753293a5a", + "9a500f4168ba46ab8427acfdd1885cda", 
+ "78cd753df9ce490fb47e75dc4ba6a9d4", + "46ea115ad6154e15ba2842c74a816d95", + "bf70868c99494bf7ae906cc205438830", + "dc5d60cd61ab4e879c32e15d9e209342", + "e7efd49f79814fa9899d97efcd22d7ae", + "11df9c4fafeb4cf99046985e6b815c89", + "70755a68b186499d980e234c5d94b7a6" + ] + }, + "id": "FtWcH9nycwGq", + "outputId": "6cb267a0-5cdd-4274-a91e-5e491a0309d6" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ec15c2e6e2304fef92fa31fff51410af", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading (…)lve/main/config.json: 0%| | 0.00/716 [00:00=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vX94VKVqDBys" + }, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [mohsenfayyaz/albert-base-v2-toxicity](https://huggingface.co/mohsenfayyaz/albert-base-v2-toxicity) model from HuggingFace as an example and load it as a `ORTModelForSequenceClassification`, representing an ONNX model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 281, + "referenced_widgets": [ + "24266b6ca1684d7c8aea7604f831f37a", + "88aa6b92bc334ac1804387ebfb9d1698", + "2d4805064e4e4806bf3412bcf62eef5d", + "7a49371987b649d3b011d40cf12a5548", + "b90108495f5e4ebd86b160e9b72ec85b", + "4f160941ce524fb8b29cb37cc3ea2267", + "8294bc8b73ed4226aca0214803abadde", + "976ef61a7e554430bd590a7f0c987d9f", + "9af08f3575f048d3b1a8a12fabb3174f", + "eba4dce635714bd49743121cd96ed63e", + "8a5496d6772c4b13971f73f02b802794", + "898c13fda78b4e0a80b9c8d53c2bef2f", + "4fcf8d1395b44f1bb006ea74a92b4be0", + "769de933bd5d464e82877fb45a68d3ee", + "d3f0162dad3546729551b45c9f66e050", + "516541dfc7b448c792ac31890465696d", + "e9b36a3aa5d344acb1ee81531ea52467", + "f59f6a269350441abada8af7d1d5f626", + "e2e80b38044149f18748513be9a85853", + "0005ac959cd940a3841fb6ba74e787aa", + "426280e25c3e41debff11ff072671bfd", + "e3da27603fab48549fbaccaa6efcb1dd", + "0e77a160734d41faa32e72e942547aca", + "7bcb81b6b28b4f3c8f6e0a6decdde11b", + "0525ac061eb14796aaa026715f6fd866", + "f9399cb3dd7f4d7795e3432d8b245b8a", + "ae2be9490fb1417e99ba85948336dadc", + "af833e08d2d345a8b6b187eb7c07ec43", + "5c3141039e51476fb9caaaf6b3b26682", + "e2d512f0b9ff4084af16e1ee66652d8a", + "461807b874084172a42be62e53b72200", + "bece3465528b452784b9e3d34f4af7de", + "e4c8452323fc4ab59449b7aaa49e733b", + "50df3046398841658e8d5bda3fcded46", + "cd48e0c9b57645568124ab8a14429c29", + "847eacbeef0f43cfabb037b0df6ba6bd", + "c5b62747f75543c09253f629fbd28d00", + "18da7ca7fa444b228dc37039d43ec5b9", + "2cf866eb0b7948c1a9ea52b2fbb2df9a", + "1bf9e151053e4db68874cf594aefdb3d", + "928fbf5c19cf41c5a31352363d536be2", + "3a86c82b805440bd99ab0d55ffc81b9e", + "da10037bf7e14051b4c04fe745aa0ad6", + "5fc67f7157ed449e8823e737db23947a", + "641950a72d064a72a8776ef9ca3c302e", + "64e2ad8cc33e4edb9fd46a44b29b6c8c", + "9d3c5888b9fd4f5c82253e41599cb1d6", + "a321a3f8240f4bbba9f64ffcb8476e9b", 
+ "156d550347b84d8480d5943e19bd1104", + "aa9a0a0db8dd4e27be22b3f5e997da60", + "92451dea04b74a8ba8d516a49904a51d", + "39d31d28e7ec42a7bff957ef218499d6", + "f1c6b446640b4c719ea90686ae6b4829", + "d5fa2619bb42411fa05c63ba40c60c9e", + "084125ccc22e4007ba6b1a6332d61cec" + ] + }, + "id": "1KO14D7FDYbV", + "outputId": "2c0aff25-7717-4cab-a671-dfdf38f96a87" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "24266b6ca1684d7c8aea7604f831f37a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading (…)lve/main/config.json: 0%| | 0.00/871 [00:00] 1.16K --.-KB/s in 0s \n", + "\n", + "2023-09-06 20:14:49 (78.1 MB/s) - written to stdout [1191/1191]\n", + "\n", + "Installing PySpark 3.2.3 and Spark NLP 5.1.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m531.2/531.2 kB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m18.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! 
wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1GFZq_URBeLI" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5Ti3X-BJBeLI", + "outputId": "212d9670-8ea5-47b7-d96d-1d0abde6c56c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4RPu9eDfBeLI" + }, + "source": [ + "- Let's use the `loadSavedModel` function in `AlbertForSequenceClassification`, which allows us to load the exported ONNX model\n", + "- Most params can be set later when you are loading this model in `AlbertForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry about what you set them to now\n", + "- `loadSavedModel` accepts two params, first is the path to the exported ONNX model. The second is the SparkSession that is the `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in the Spark NLP 4.2.2 release. 
Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file system natively.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gFIeDIyVBeLI" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "sequenceClassifier = AlbertForSequenceClassification\\\n", + " .loadSavedModel(EXPORT_PATH, spark)\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")\\\n", + " .setCaseSensitive(False)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iDgUKFgyBeLI" + }, + "source": [ + "- Let's save it on disk so it is easier to move around and to use later via the `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x5wvcWXrBeLI" + }, + "outputs": [], + "source": [ + "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(MODEL_NAME))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vSjvf8woBeLI" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Blq6dSf1BeLJ" + }, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NR0Dge9gBeLJ" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your AlbertForSequenceClassification model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xKvpIhytBeLJ", + "outputId": "3736b30b-f857-496a-ee56-5d826461109e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 46784\n", + "-rw-r--r-- 1 
root root 760289 Sep 6 20:17 albert_spp\n", + "drwxr-xr-x 3 root root 4096 Sep 6 20:17 fields\n", + "drwxr-xr-x 2 root root 4096 Sep 6 20:17 metadata\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GylPMYsEBeLJ" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny AlbertForSequenceClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PeY7xqh7BeLJ" + }, + "outputs": [], + "source": [ + "sequenceClassifier_loaded = AlbertForSequenceClassification.load(\"./{}_spark_nlp_onnx\".format(MODEL_NAME))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZRNL6MuYBeLJ" + }, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wkYTgXO5BeLJ", + "outputId": "1fe4bc5b-7f91-4888-f103-4b1362c7f40f" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Non-Toxic', 'Toxic']" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "sequenceClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1ROrCoh7BeLK" + }, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP 🚀 pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "752d8NgmBeLK", + "outputId": "118697e0-edd1-478e-b766-b6092af68b39" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+--------------------+-----------+\n", + "| text| result|\n", + "+--------------------+-----------+\n", + "| I love you!|[Non-Toxic]|\n", + "|I feel lucky to b...|[Non-Toxic]|\n", + "| I hate her!| [Toxic]|\n", + "+--------------------+-----------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.ml import Pipeline\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " sequenceClassifier_loaded\n", + "])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([[\"I love you!\"], ['I feel lucky to be here.'], ['I hate her!']]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"class.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jdXhBn3wBeLK" + }, + "source": [ + "That's it! 
You can now go wild and use hundreds of `AlbertForSequenceClassification` models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0005ac959cd940a3841fb6ba74e787aa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0525ac061eb14796aaa026715f6fd866": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e2d512f0b9ff4084af16e1ee66652d8a", + "max": 428, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_461807b874084172a42be62e53b72200", + "value": 428 + } + }, + "084125ccc22e4007ba6b1a6332d61cec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": 
{ + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0e77a160734d41faa32e72e942547aca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7bcb81b6b28b4f3c8f6e0a6decdde11b", + "IPY_MODEL_0525ac061eb14796aaa026715f6fd866", + "IPY_MODEL_f9399cb3dd7f4d7795e3432d8b245b8a" + ], + "layout": "IPY_MODEL_ae2be9490fb1417e99ba85948336dadc" + } + }, + "156d550347b84d8480d5943e19bd1104": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "18da7ca7fa444b228dc37039d43ec5b9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1bf9e151053e4db68874cf594aefdb3d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "24266b6ca1684d7c8aea7604f831f37a": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_88aa6b92bc334ac1804387ebfb9d1698", + "IPY_MODEL_2d4805064e4e4806bf3412bcf62eef5d", + "IPY_MODEL_7a49371987b649d3b011d40cf12a5548" + ], + "layout": "IPY_MODEL_b90108495f5e4ebd86b160e9b72ec85b" + } + }, + "2cf866eb0b7948c1a9ea52b2fbb2df9a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2d4805064e4e4806bf3412bcf62eef5d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_976ef61a7e554430bd590a7f0c987d9f", + "max": 871, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_9af08f3575f048d3b1a8a12fabb3174f", + "value": 871 + } + }, + "39d31d28e7ec42a7bff957ef218499d6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3a86c82b805440bd99ab0d55ffc81b9e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "426280e25c3e41debff11ff072671bfd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "461807b874084172a42be62e53b72200": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + 
"_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4f160941ce524fb8b29cb37cc3ea2267": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4fcf8d1395b44f1bb006ea74a92b4be0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e9b36a3aa5d344acb1ee81531ea52467", + "placeholder": "​", + "style": "IPY_MODEL_f59f6a269350441abada8af7d1d5f626", + "value": "Downloading pytorch_model.bin: 100%" + 
} + }, + "50df3046398841658e8d5bda3fcded46": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_cd48e0c9b57645568124ab8a14429c29", + "IPY_MODEL_847eacbeef0f43cfabb037b0df6ba6bd", + "IPY_MODEL_c5b62747f75543c09253f629fbd28d00" + ], + "layout": "IPY_MODEL_18da7ca7fa444b228dc37039d43ec5b9" + } + }, + "516541dfc7b448c792ac31890465696d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5c3141039e51476fb9caaaf6b3b26682": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5fc67f7157ed449e8823e737db23947a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "641950a72d064a72a8776ef9ca3c302e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_64e2ad8cc33e4edb9fd46a44b29b6c8c", + "IPY_MODEL_9d3c5888b9fd4f5c82253e41599cb1d6", + "IPY_MODEL_a321a3f8240f4bbba9f64ffcb8476e9b" + ], + "layout": "IPY_MODEL_156d550347b84d8480d5943e19bd1104" + } + }, + "64e2ad8cc33e4edb9fd46a44b29b6c8c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aa9a0a0db8dd4e27be22b3f5e997da60", + "placeholder": "​", + "style": "IPY_MODEL_92451dea04b74a8ba8d516a49904a51d", + "value": "Downloading (…)cial_tokens_map.json: 100%" + } + }, + "769de933bd5d464e82877fb45a68d3ee": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e2e80b38044149f18748513be9a85853", + "max": 46756715, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0005ac959cd940a3841fb6ba74e787aa", + "value": 46756715 + } + }, + "7a49371987b649d3b011d40cf12a5548": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_eba4dce635714bd49743121cd96ed63e", + "placeholder": "​", + "style": "IPY_MODEL_8a5496d6772c4b13971f73f02b802794", + "value": " 871/871 [00:00<00:00, 23.7kB/s]" + } + }, + "7bcb81b6b28b4f3c8f6e0a6decdde11b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_af833e08d2d345a8b6b187eb7c07ec43", + "placeholder": "​", + "style": "IPY_MODEL_5c3141039e51476fb9caaaf6b3b26682", + "value": "Downloading (…)okenizer_config.json: 100%" + } + }, + "8294bc8b73ed4226aca0214803abadde": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "847eacbeef0f43cfabb037b0df6ba6bd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_928fbf5c19cf41c5a31352363d536be2", + "max": 760289, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3a86c82b805440bd99ab0d55ffc81b9e", + "value": 760289 + } + }, + "88aa6b92bc334ac1804387ebfb9d1698": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4f160941ce524fb8b29cb37cc3ea2267", + "placeholder": "​", + "style": "IPY_MODEL_8294bc8b73ed4226aca0214803abadde", + "value": "Downloading (…)lve/main/config.json: 100%" + } + }, + "898c13fda78b4e0a80b9c8d53c2bef2f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4fcf8d1395b44f1bb006ea74a92b4be0", + "IPY_MODEL_769de933bd5d464e82877fb45a68d3ee", + "IPY_MODEL_d3f0162dad3546729551b45c9f66e050" + ], + "layout": "IPY_MODEL_516541dfc7b448c792ac31890465696d" + } + }, + "8a5496d6772c4b13971f73f02b802794": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "92451dea04b74a8ba8d516a49904a51d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "928fbf5c19cf41c5a31352363d536be2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + 
"model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "976ef61a7e554430bd590a7f0c987d9f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + 
"justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9af08f3575f048d3b1a8a12fabb3174f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9d3c5888b9fd4f5c82253e41599cb1d6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_39d31d28e7ec42a7bff957ef218499d6", + "max": 245, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f1c6b446640b4c719ea90686ae6b4829", + "value": 245 + } + }, + "a321a3f8240f4bbba9f64ffcb8476e9b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d5fa2619bb42411fa05c63ba40c60c9e", + "placeholder": "​", + "style": "IPY_MODEL_084125ccc22e4007ba6b1a6332d61cec", + "value": " 245/245 [00:00<00:00, 8.97kB/s]" + } + }, + "aa9a0a0db8dd4e27be22b3f5e997da60": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ae2be9490fb1417e99ba85948336dadc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "af833e08d2d345a8b6b187eb7c07ec43": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": 
null + } + }, + "b90108495f5e4ebd86b160e9b72ec85b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bece3465528b452784b9e3d34f4af7de": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + 
"grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c5b62747f75543c09253f629fbd28d00": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_da10037bf7e14051b4c04fe745aa0ad6", + "placeholder": "​", + "style": "IPY_MODEL_5fc67f7157ed449e8823e737db23947a", + "value": " 760k/760k [00:00<00:00, 2.69MB/s]" + } + }, + "cd48e0c9b57645568124ab8a14429c29": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2cf866eb0b7948c1a9ea52b2fbb2df9a", + "placeholder": "​", + "style": "IPY_MODEL_1bf9e151053e4db68874cf594aefdb3d", + "value": "Downloading spiece.model: 100%" + } + }, + "d3f0162dad3546729551b45c9f66e050": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + 
"model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_426280e25c3e41debff11ff072671bfd", + "placeholder": "​", + "style": "IPY_MODEL_e3da27603fab48549fbaccaa6efcb1dd", + "value": " 46.8M/46.8M [00:00<00:00, 93.2MB/s]" + } + }, + "d5fa2619bb42411fa05c63ba40c60c9e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da10037bf7e14051b4c04fe745aa0ad6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e2d512f0b9ff4084af16e1ee66652d8a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + 
"max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e2e80b38044149f18748513be9a85853": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e3da27603fab48549fbaccaa6efcb1dd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "e4c8452323fc4ab59449b7aaa49e733b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e9b36a3aa5d344acb1ee81531ea52467": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "eba4dce635714bd49743121cd96ed63e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f1c6b446640b4c719ea90686ae6b4829": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f59f6a269350441abada8af7d1d5f626": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f9399cb3dd7f4d7795e3432d8b245b8a": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bece3465528b452784b9e3d34f4af7de", + "placeholder": "​", + "style": "IPY_MODEL_e4c8452323fc4ab59449b7aaa49e733b", + "value": " 428/428 [00:00<00:00, 14.7kB/s]" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForTokenClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForTokenClassification.ipynb new file mode 100644 index 00000000000000..b63c604624c43a --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForTokenClassification.ipynb @@ -0,0 +1,2863 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "t609_kwpJbwZ" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace%20ONNX%20in%20Spark%20NLP%20-%20AlbertForTokenClassification.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zAKzkwH5Jbwf" + }, + "source": [ + "## Import ONNX AlbertForTokenClassification models from HuggingFace 🤗 into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `AlbertForTokenClassification` is only available since in `Spark NLP 
5.1.1` and after. So please make sure you have upgraded to the latest Spark NLP release\n- You can import ALBERT models trained/fine-tuned for token classification via `AlbertForTokenClassification` or `TFAlbertForTokenClassification`. These models are usually under `Token Classification` category and have `albert` in their labels\n", + "- Reference: [TFAlbertForTokenClassification](https://huggingface.co/transformers/model_doc/albert.html#tfalbertfortokenclassification)\n", + "- Some [example models](https://huggingface.co/models?filter=albert&pipeline_tag=token-classification)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4Wd_fyDVJbwg" + }, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dyDoTKBbJbwg" + }, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and its dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`.
This doesn't mean it won't work with the future releases\n", + "- Albert uses SentencePiece, so we will have to install that as well" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Mm0OMPsCJbwh", + "outputId": "86620f0c-572d-43e9-ed5d-709a0cf47f71" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m44.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.6/380.6 kB\u001b[0m \u001b[31m34.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m67.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m23.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m70.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m18.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.6/519.6 kB\u001b[0m \u001b[31m16.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m53.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m44.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m13.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m24.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum sentencepiece" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MvbxodR1Jbwi" + }, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [HooshvareLab/albert-fa-zwnj-base-v2-ner](https://huggingface.co/HooshvareLab/albert-fa-zwnj-base-v2-ner) model from HuggingFace as an example\n", + "- In addition to `TFAlbertForTokenClassification` we also need to save the `AlbertTokenizer`. This is the same for every model, these are assets needed for tokenization inside Spark NLP." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 313, + "referenced_widgets": [ + "c5f0444f81694122b1af3734423d115f", + "4041a4ab172440818c3ca2b90afee721", + "bc1f726515f44781bdb449a1f23e7420", + "ddbee43acfdc49f19484da9e8e9cf035", + "248cfb35234841b08d79dfb830fec33c", + "38f0363e671a46ff9e412a66f448fcc2", + "8a885bab9ea740cab55ba6bf3f7b79d6", + "7cf1c4efafaa493f8169f0450aec790b", + "71c7b3dac0194c24abc065a1f8e32a29", + "2e9490e95e6e4e64ba65ad5e67cbc5d3", + "c61e561ed33543f8b4a70af112ca2277", + "05fec899b57a403c8762daee89e2f2b9", + "c4835ed3fc64462a909c61668c622a4f", + "d4191e35552d458483659e8c8c26ba66", + "87609470a48b4740a11573a0209a3431", + "7f4ed8a7559e4b539113fa7485b68a02", + "ef4f23c5f83a45889a7c0c30efde5444", + "a2624fc48a144e10b7d93404b0963df6", + "597fd645c02d490d94e2544695e9cc51", + "1b22e98bc60c4cbd8c4e067688921a1e", + "32e1910e9af748ef9536f9fc201ae4fb", + "b14654e692164b89845fc75b0d927bb7", + "efdfe67562f04ea3a0ccdc2ec9fd4121", + "b9739959867d4d9da6cc94c2685266e7", + "cb9f299efc0a4bc69cb92c9a2aa72abe", + "8a28443ba15e40a793407ca15dcd41ec", + "b63759abba4d400d925b8e76af597abc", + "662a9f7471b2436897f5249403d736eb", + "6dc6e6c109a74fe8b7459a5a6f0ad5c9", + "4f206f1c85704190a4a273215322c725", + "37355cfc489b4266910e80c2f8a5dd79", + "902c2c97d8b349ca93a95bd5d063fcfd", + "1a41f7ce7c234f3fbac73ce021bdcc35", + "5133176f6f354b038683a3804df9973e", + "35864eeb1e7147afa220ebc1c7e1aab5", + "fc0e18d517f94331996ef6cba7d829cb", + "58cf3b0578744138ad2f68f26961fb7e", + "8d2dea9b62054f7d8fcbdb142427a70e", + "f14deafdab2948ad98c0b5fb5e37ba56", + "968aead46d9e4f76b56db7b9b7ab5732", + "154f44fb89384497b3c562ca3bd1d16f", + "f0130c5e27294c08953386e9b24c0a9d", + "9e04c3d5b691406ead7697edf9cbc299", + "3f15395b877342e2ab8c6516c13c54c7", + "aba730b0080344fb842a90b2f7c71488", + "a19c476677f04b969b512c32931f885e", + "f272b63049ad4156807ca08212fac696", + "a8879aa1c5904e22a620ac38ce72130c", 
+ "120c718274714470b4a90d1c6bab8046", + "0268cdf911084e9e9e267120134a1f21", + "ce07d827ddf4442bb5510223918a63cd", + "1612259e50ce4e6e9f4a32b7a8c7b630", + "4b3019aa8cc14c84a9945647996b840d", + "d28c2514eba344c3af0b7c68297625e1", + "b461672f7e884d8aa7772b0f565ad32d", + "57af3cdfae814885a10d3757cd59439c", + "1a538d88f95d43fea029ae21763da492", + "4905f6d24cfd498688bb2e5e63a78c19", + "a604832621ec44e4ad79dfdd3e243241", + "944fbbcf27854d299aa18033f14f9b4a", + "736067dc3ccc4d9fae0cb34ab66cbbf3", + "b76767ba240b486ebe5b0d89194b9663", + "259cb7680bfb40e09c2a5d1a595fc8d7", + "5d75d662f9ac4fd9997fe7ffe7cbf479", + "0af2f0c896594863bf62d4de2bc32550", + "b4fea60ce4d54978b6d062f10f42676d" + ] + }, + "id": "69dqor6oJbwj", + "outputId": "3e186a15-0df1-42fa-d5b4-d1dd2d9328ed" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c5f0444f81694122b1af3734423d115f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading (…)lve/main/config.json: 0%| | 0.00/1.57k [00:00 Date: Fri, 8 Sep 2023 18:45:25 +0200 Subject: [PATCH 08/12] Fix misspelling [skip test] --- docs/_layouts/landing.html | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/_layouts/landing.html b/docs/_layouts/landing.html index e972df028620ce..ae974c683221b7 100755 --- a/docs/_layouts/landing.html +++ b/docs/_layouts/landing.html @@ -224,8 +224,8 @@

Transformers at Scale

Unlock the power of Large Language Models with Spark NLP 🚀, the only open-source library that delivers cutting-edge transformers for production such as BERT, CamemBERT, ALBERT, ELECTRA, XLNet, DistilBERT, RoBERTa, DeBERTa, - XLM-RoBERTa, Longformer, ELMO, Universal Sentence Encoder, Facebook BART, Instructor Embeddings, E5 Embeddings, Google T5, MarianMT, OpenAI GPT2, - Google ViT, ASR Wav2Vec2 and many more not only to Python, and R but also to JVM ecosystem (Java, Scala, and Kotlin) at scale by extending Apache Spark natively + XLM-RoBERTa, Longformer, ELMO, Universal Sentence Encoder, Facebook BART, Instructor Embeddings, E5 Embeddings, MPNet Embeddings, Google T5, MarianMT, OpenAI GPT2, + Google ViT, ASR Wav2Vec2, OpenAI Whisper and many more not only to Python, and R but also to JVM ecosystem (Java and Scala) at scale by extending Apache Spark natively
@@ -335,7 +335,7 @@

NLP Features

  • Vision Transformer (Google ViT) Image Classification
  • Microsoft Swin Transformer Image Classification
  • Facebook ConvNext Image Classification
  • -
  • Automatic Speech Recognition (OpeAI Whisper, Wav2Vec2 & HuBERT)
  • +
  • Automatic Speech Recognition (OpenAI Whisper, Wav2Vec2 & HuBERT)
  • Easy ONNX and TensorFlow integrations
  • GPU Support
  • Full integration with Spark ML functions
  • From 04f2f7ec82d6eff01329668e4de6886cb50e5a88 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Mon, 11 Sep 2023 19:16:37 +0500 Subject: [PATCH 09/12] Fixing onnx saving path bug (#13959) * fixing onnx write issue on windows * fixing indentation * fixing formatting * fixing formatting * final formatting fix * Fix onnx saving bug --------- Co-authored-by: Devin Ha Co-authored-by: Maziyar Panahi --- src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala | 8 +++++--- .../nlp/annotators/audio/WhisperForCTCTest.scala | 2 +- .../nlp/embeddings/E5EmbeddingsTestSpec.scala | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala index 410e5aea102044..1a4b9b925b033f 100644 --- a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala +++ b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala @@ -58,9 +58,12 @@ class OnnxWrapper(var onnxModel: Array[Byte]) extends Serializable { .toString // 2. Save onnx model - val onnxFile = Paths.get(tmpFolder, file).toString - FileUtils.writeByteArrayToFile(new File(onnxFile), onnxModel) + val fileName = Paths.get(file).getFileName.toString + val onnxFile = Paths + .get(tmpFolder, fileName) + .toString + FileUtils.writeByteArrayToFile(new File(onnxFile), onnxModel) // 4. 
Zip folder if (zip) ZipArchiveUtil.zip(tmpFolder, file) @@ -163,5 +166,4 @@ object OnnxWrapper { encoder: OnnxWrapper, decoder: OnnxWrapper, decoderWithPast: OnnxWrapper) - } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala index 0d37be98bcb1d7..2a4592db3c76d0 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/audio/WhisperForCTCTest.scala @@ -32,7 +32,7 @@ class WhisperForCTCTest extends AnyFlatSpec with WhisperForCTCBehaviors { // Needs to be added manually lazy val modelTf: WhisperForCTC = WhisperForCTC - .pretrained("asr_whisper_tiny") + .pretrained("asr_whisper_tiny", "xx") .setInputCols("audio_assembler") .setOutputCol("document") diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala index 1e4ac3c697c10e..a63f3e1a0cec70 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/E5EmbeddingsTestSpec.scala @@ -18,7 +18,7 @@ package com.johnsnowlabs.nlp.embeddings import com.johnsnowlabs.nlp.base.DocumentAssembler import com.johnsnowlabs.nlp.util.io.ResourceHelper -import com.johnsnowlabs.tags.{SlowTest, FastTest} +import com.johnsnowlabs.tags.{SlowTest} import org.apache.spark.ml.Pipeline import org.scalatest.flatspec.AnyFlatSpec From 5567bfc6b2c830735bddfc30d1b820480a879cbf Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Mon, 11 Sep 2023 16:21:14 +0200 Subject: [PATCH 10/12] CHANGELOG 5.1.1 [run doc] --- CHANGELOG | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 21259f88ed2a4d..83b604fbc6e1a8 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,22 @@ +======== +5.1.1 +======== +---------------- +New Features & 
Enhancements +---------------- +* **NEW:** Introducing support for ONNX Runtime in MPNet embedding annotator +* **NEW:** Introducing support for ONNX Runtime in AlbertForTokenClassification annotator +* **NEW:** Introducing support for ONNX Runtime in AlbertForSequenceClassification annotator +* **NEW:** Introducing support for ONNX Runtime in AlbertForQuestionAnswering annotator +* Implement `getVectors` feature in Word2VecModel, Doc2VecModel, and WordEmbeddingsModel annotators. This new feature allows access to the entire tokens and their vectors in the loaded model. + +---------------- +Bug Fixes +---------------- +* Fix how to save and load `Whisper` models +* Fix saving ONNX model on Windows operating system + + ======== 5.1.0 ======== From 010ad9f5c958ced66542900f1dfd520d996cf76e Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 11 Sep 2023 15:42:23 +0000 Subject: [PATCH 11/12] Update Scala and Python APIs --- docs/api/com/index.html | 8 +-- .../com/johnsnowlabs/client/CloudClient.html | 8 +-- .../com/johnsnowlabs/client/CloudManager.html | 8 +-- .../johnsnowlabs/client/CloudResources$.html | 8 +-- .../com/johnsnowlabs/client/CloudStorage.html | 8 +-- .../client/aws/AWSAnonymousCredentials.html | 8 +-- .../client/aws/AWSBasicCredentials.html | 8 +-- .../johnsnowlabs/client/aws/AWSClient.html | 8 +-- .../client/aws/AWSCredentialsProvider.html | 8 +-- .../johnsnowlabs/client/aws/AWSGateway.html | 8 +-- .../client/aws/AWSProfileCredentials.html | 8 +-- .../client/aws/AWSTokenCredentials.html | 8 +-- .../client/aws/CredentialParams.html | 8 +-- .../johnsnowlabs/client/aws/Credentials.html | 8 +-- .../com/johnsnowlabs/client/aws/index.html | 8 +-- .../client/azure/AzureClient.html | 8 +-- .../client/azure/AzureGateway.html | 8 +-- .../com/johnsnowlabs/client/azure/index.html | 8 +-- .../johnsnowlabs/client/gcp/GCPClient.html | 8 +-- .../johnsnowlabs/client/gcp/GCPGateway.html | 8 +-- .../com/johnsnowlabs/client/gcp/index.html | 8 +-- 
docs/api/com/johnsnowlabs/client/index.html | 8 +-- .../client/util/CloudHelper$.html | 8 +-- .../com/johnsnowlabs/client/util/index.html | 8 +-- .../johnsnowlabs/collections/SearchTrie$.html | 8 +-- .../johnsnowlabs/collections/SearchTrie.html | 8 +-- .../collections/StorageSearchTrie$.html | 8 +-- .../collections/StorageSearchTrie.html | 8 +-- .../com/johnsnowlabs/collections/index.html | 8 +-- docs/api/com/johnsnowlabs/index.html | 8 +-- docs/api/com/johnsnowlabs/ml/ai/DeBerta.html | 8 +-- .../ml/ai/MergeTokenStrategy$.html | 8 +-- .../johnsnowlabs/ml/ai/OpenAICompletion.html | 8 +-- .../johnsnowlabs/ml/ai/OpenAIEmbeddings.html | 8 +-- docs/api/com/johnsnowlabs/ml/ai/index.html | 8 +-- .../com/johnsnowlabs/ml/ai/model/Choice.html | 8 +-- .../ml/ai/model/CompletionResponse.html | 8 +-- .../ml/ai/model/EmbeddingData.html | 8 +-- .../ml/ai/model/TextEmbeddingResponse.html | 8 +-- .../com/johnsnowlabs/ml/ai/model/Usage.html | 8 +-- .../johnsnowlabs/ml/ai/model/UsageData.html | 8 +-- .../com/johnsnowlabs/ml/ai/model/index.html | 8 +-- .../ml/ai/util/Generation/Generate.html | 8 +-- .../ai/util/Generation/GenerationConfig.html | 8 +-- .../ml/ai/util/Generation/Logit/Logit.html | 8 +-- .../ForcedTokenLogitProcessor.html | 8 +-- .../Logit/LogitProcess/LogitProcessor.html | 8 +-- .../LogitProcess/MinLengthLogitProcessor.html | 8 +-- .../NoRepeatNgramsLogitProcessor.html | 8 +-- .../RepetitionPenaltyLogitProcessor.html | 8 +-- .../LogitProcess/SuppressLogitProcessor.html | 8 +-- .../Generation/Logit/LogitProcess/index.html | 8 +-- .../Generation/Logit/LogitProcessorList.html | 8 +-- .../Logit/LogitWarper/LogitWarper.html | 8 +-- .../LogitWarper/TemperatureLogitWarper.html | 8 +-- .../Logit/LogitWarper/TopKLogitWarper.html | 8 +-- .../Logit/LogitWarper/TopPLogitWarper.html | 8 +-- .../Generation/Logit/LogitWarper/index.html | 8 +-- .../ml/ai/util/Generation/Logit/index.html | 8 +-- .../Generation/Search/BeamHypotheses.html | 8 +-- 
.../ai/util/Generation/Search/BeamScorer.html | 8 +-- .../Generation/Search/BeamSearchScorer.html | 8 +-- .../ml/ai/util/Generation/Search/index.html | 8 +-- .../ml/ai/util/Generation/index.html | 8 +-- .../com/johnsnowlabs/ml/ai/util/index.html | 8 +-- docs/api/com/johnsnowlabs/ml/crf/Attr.html | 8 +-- .../com/johnsnowlabs/ml/crf/AttrFeature.html | 8 +-- .../api/com/johnsnowlabs/ml/crf/AttrStat.html | 8 +-- .../com/johnsnowlabs/ml/crf/CrfDataset.html | 8 +-- .../com/johnsnowlabs/ml/crf/CrfParams.html | 8 +-- .../johnsnowlabs/ml/crf/DatasetEncoder.html | 8 +-- .../johnsnowlabs/ml/crf/DatasetMetadata.html | 8 +-- .../johnsnowlabs/ml/crf/DatasetReader$.html | 8 +-- .../johnsnowlabs/ml/crf/EdgeCalculator$.html | 8 +-- .../com/johnsnowlabs/ml/crf/FbCalculator.html | 8 +-- .../api/com/johnsnowlabs/ml/crf/Instance.html | 8 +-- .../johnsnowlabs/ml/crf/InstanceLabels.html | 8 +-- .../johnsnowlabs/ml/crf/L2DecayStrategy.html | 8 +-- .../johnsnowlabs/ml/crf/LinearChainCrf.html | 8 +-- .../ml/crf/LinearChainCrfModel.html | 8 +-- .../ml/crf/SerializedDatasetMetadata.html | 8 +-- .../ml/crf/SerializedLinearChainCrfModel.html | 8 +-- .../ml/crf/SparseArray$$SeqWrapper.html | 8 +-- .../com/johnsnowlabs/ml/crf/SparseArray$.html | 8 +-- .../com/johnsnowlabs/ml/crf/SparseArray.html | 8 +-- .../ml/crf/TextSentenceAttrs.html | 8 +-- .../ml/crf/TextSentenceLabels.html | 8 +-- .../com/johnsnowlabs/ml/crf/Transition.html | 8 +-- .../com/johnsnowlabs/ml/crf/VectorMath$.html | 8 +-- .../com/johnsnowlabs/ml/crf/WordAttrs.html | 8 +-- docs/api/com/johnsnowlabs/ml/crf/index.html | 8 +-- docs/api/com/johnsnowlabs/ml/index.html | 8 +-- .../OnnxWrapper$$EncoderDecoderWrappers.html | 8 +-- .../johnsnowlabs/ml/onnx/OnnxWrapper$.html | 8 +-- .../com/johnsnowlabs/ml/onnx/OnnxWrapper.html | 8 +-- .../johnsnowlabs/ml/onnx/ReadOnnxModel.html | 10 +-- ...sources$$implicits$$OnnxSessionResult.html | 8 +-- .../ml/onnx/TensorResources$$implicits$.html | 8 +-- .../ml/onnx/TensorResources$.html | 8 +-- 
.../johnsnowlabs/ml/onnx/TensorResources.html | 8 +-- .../johnsnowlabs/ml/onnx/WriteOnnxModel.html | 10 +-- docs/api/com/johnsnowlabs/ml/onnx/index.html | 8 +-- .../tensorflow/ClassifierDatasetEncoder.html | 8 +-- .../ClassifierDatasetEncoderParams.html | 8 +-- .../ml/tensorflow/DatasetEncoderParams.html | 8 +-- .../johnsnowlabs/ml/tensorflow/Logging.html | 8 +-- .../ml/tensorflow/ModelSignature.html | 8 +-- .../johnsnowlabs/ml/tensorflow/NerBatch$.html | 8 +-- .../johnsnowlabs/ml/tensorflow/NerBatch.html | 8 +-- .../ml/tensorflow/NerDatasetEncoder.html | 8 +-- .../ml/tensorflow/ReadTensorflowModel.html | 8 +-- .../ml/tensorflow/SentenceGrouper.html | 8 +-- .../ml/tensorflow/TensorResources$.html | 8 +-- .../ml/tensorflow/TensorResources.html | 8 +-- .../ml/tensorflow/TensorflowClassifier.html | 8 +-- .../ml/tensorflow/TensorflowWrapper$.html | 8 +-- .../ml/tensorflow/TensorflowWrapper.html | 8 +-- .../johnsnowlabs/ml/tensorflow/Variables.html | 8 +-- .../ml/tensorflow/WriteTensorflowModel.html | 8 +-- .../com/johnsnowlabs/ml/tensorflow/index.html | 8 +-- .../sentencepiece/ReadSentencePieceModel.html | 8 +-- .../sentencepiece/SentencePieceException.html | 8 +-- .../sentencepiece/SentencePieceProcessor.html | 8 +-- .../sentencepiece/SentencePieceWrapper$.html | 8 +-- .../WriteSentencePieceModel.html | 8 +-- .../ml/tensorflow/sentencepiece/index.html | 8 +-- ...delSignatureConstants$$AttentionMask$.html | 8 +-- ...lSignatureConstants$$AttentionMaskV1$.html | 8 +-- ...SignatureConstants$$AudioValuesInput$.html | 8 +-- ...s$$CachedDecoderEncoderAttentionMask$.html | 8 +-- ...stants$$CachedDecoderEncoderInputIds$.html | 8 +-- ...eConstants$$CachedDecoderInputCache1$.html | 8 +-- ...eConstants$$CachedDecoderInputCache2$.html | 8 +-- ...tureConstants$$CachedDecoderInputIds$.html | 8 +-- ...natureConstants$$CachedEncoderOutput$.html | 8 +-- ...gnatureConstants$$CachedLogitsOutput$.html | 8 +-- ...delSignatureConstants$$CachedOutPut2$.html | 8 +-- 
...delSignatureConstants$$CachedOutput1$.html | 8 +-- .../sign/ModelSignatureConstants$$DType$.html | 8 +-- ...atureConstants$$DecoderAttentionMask$.html | 8 +-- ...nstants$$DecoderEncoderAttentionMask$.html | 8 +-- ...ureConstants$$DecoderEncoderInputIds$.html | 8 +-- ...lSignatureConstants$$DecoderInputIds$.html | 8 +-- ...delSignatureConstants$$DecoderOutput$.html | 8 +-- .../ModelSignatureConstants$$DimCount$.html | 8 +-- ...atureConstants$$EncoderAttentionMask$.html | 8 +-- ...gnatureConstants$$EncoderContextMask$.html | 8 +-- ...lSignatureConstants$$EncoderInputIds$.html | 8 +-- ...delSignatureConstants$$EncoderOutput$.html | 8 +-- ...lSignatureConstants$$EndLogitsOutput$.html | 8 +-- ...ignatureConstants$$InitCachedOutPut2$.html | 8 +-- ...ignatureConstants$$InitCachedOutput1$.html | 8 +-- ...nts$$InitDecoderEncoderAttentionMask$.html | 8 +-- ...onstants$$InitDecoderEncoderInputIds$.html | 8 +-- ...natureConstants$$InitDecoderInputIds$.html | 8 +-- ...SignatureConstants$$InitLogitsOutput$.html | 8 +-- .../ModelSignatureConstants$$InputIds$.html | 8 +-- .../ModelSignatureConstants$$InputIdsV1$.html | 8 +-- ...lSignatureConstants$$LastHiddenState$.html | 8 +-- ...ignatureConstants$$LastHiddenStateV1$.html | 8 +-- ...odelSignatureConstants$$LogitsOutput$.html | 8 +-- .../sign/ModelSignatureConstants$$Name$.html | 8 +-- ...SignatureConstants$$PixelValuesInput$.html | 8 +-- ...odelSignatureConstants$$PoolerOutput$.html | 8 +-- ...elSignatureConstants$$PoolerOutputV1$.html | 8 +-- ...elSignatureConstants$$SerializedSize$.html | 8 +-- ...odelSignatureConstants$$ShapeDimList$.html | 8 +-- ...ignatureConstants$$StartLogitsOutput$.html | 8 +-- ...lSignatureConstants$$TFInfoDescriptor.html | 8 +-- ...lSignatureConstants$$TFInfoNameMapper.html | 8 +-- ...stants$$TapasLogitsAggregationOutput$.html | 8 +-- ...ignatureConstants$$TapasLogitsOutput$.html | 8 +-- ...odelSignatureConstants$$TokenTypeIds$.html | 8 +-- ...elSignatureConstants$$TokenTypeIdsV1$.html | 8 +-- 
.../sign/ModelSignatureConstants$.html | 8 +-- .../sign/ModelSignatureManager$.html | 8 +-- .../ml/tensorflow/sign/index.html | 8 +-- ...inAlg$$implicits$$ExtendedDenseMatrix.html | 8 +-- .../ml/util/LinAlg$$implicits$.html | 8 +-- .../api/com/johnsnowlabs/ml/util/LinAlg$.html | 8 +-- .../ml/util/LoadExternalModel$.html | 8 +-- .../com/johnsnowlabs/ml/util/ModelArch$.html | 8 +-- .../com/johnsnowlabs/ml/util/ModelEngine.html | 8 +-- docs/api/com/johnsnowlabs/ml/util/ONNX$.html | 8 +-- .../com/johnsnowlabs/ml/util/PyTorch$.html | 8 +-- .../com/johnsnowlabs/ml/util/TensorFlow$.html | 8 +-- .../com/johnsnowlabs/ml/util/Unknown$.html | 8 +-- docs/api/com/johnsnowlabs/ml/util/index.html | 8 +-- .../johnsnowlabs/nlp/ActivationFunction$.html | 8 +-- .../nlp/Annotation$$AnnotationContainer.html | 8 +-- ...nnotation$$extractors$$AnnotationData.html | 8 +-- .../nlp/Annotation$$extractors$.html | 8 +-- .../api/com/johnsnowlabs/nlp/Annotation$.html | 8 +-- docs/api/com/johnsnowlabs/nlp/Annotation.html | 8 +-- .../AnnotationAudio$$AnnotationContainer.html | 8 +-- .../nlp/AnnotationAudio$$AudioFields.html | 8 +-- .../johnsnowlabs/nlp/AnnotationAudio$.html | 8 +-- .../com/johnsnowlabs/nlp/AnnotationAudio.html | 8 +-- .../AnnotationImage$$AnnotationContainer.html | 8 +-- .../nlp/AnnotationImage$$ImageFields.html | 8 +-- .../johnsnowlabs/nlp/AnnotationImage$.html | 8 +-- .../com/johnsnowlabs/nlp/AnnotationImage.html | 8 +-- .../johnsnowlabs/nlp/AnnotatorApproach.html | 8 +-- .../com/johnsnowlabs/nlp/AnnotatorModel.html | 8 +-- .../com/johnsnowlabs/nlp/AnnotatorType$.html | 8 +-- .../com/johnsnowlabs/nlp/AudioAssembler$.html | 8 +-- .../com/johnsnowlabs/nlp/AudioAssembler.html | 8 +-- docs/api/com/johnsnowlabs/nlp/CanBeLazy.html | 8 +-- docs/api/com/johnsnowlabs/nlp/Doc2Chunk$.html | 8 +-- docs/api/com/johnsnowlabs/nlp/Doc2Chunk.html | 8 +-- .../johnsnowlabs/nlp/DocumentAssembler$.html | 8 +-- .../johnsnowlabs/nlp/DocumentAssembler.html | 8 +-- 
.../johnsnowlabs/nlp/EmbeddingsFinisher$.html | 8 +-- .../johnsnowlabs/nlp/EmbeddingsFinisher.html | 8 +-- .../com/johnsnowlabs/nlp/FeaturesReader.html | 8 +-- .../com/johnsnowlabs/nlp/FeaturesWriter.html | 8 +-- docs/api/com/johnsnowlabs/nlp/Finisher$.html | 8 +-- docs/api/com/johnsnowlabs/nlp/Finisher.html | 8 +-- .../com/johnsnowlabs/nlp/GraphFinisher.html | 8 +-- .../nlp/HasAudioFeatureProperties.html | 8 +-- .../johnsnowlabs/nlp/HasBatchedAnnotate.html | 8 +-- .../nlp/HasBatchedAnnotateAudio.html | 8 +-- .../nlp/HasBatchedAnnotateImage.html | 8 +-- .../nlp/HasCandidateLabelsProperties.html | 8 +-- .../nlp/HasCaseSensitiveProperties.html | 8 +-- .../HasClassifierActivationProperties.html | 8 +-- .../nlp/HasEnableCachingProperties.html | 8 +-- docs/api/com/johnsnowlabs/nlp/HasEngine.html | 8 +-- .../api/com/johnsnowlabs/nlp/HasFeatures.html | 8 +-- .../nlp/HasGeneratorProperties.html | 8 +-- .../nlp/HasImageFeatureProperties.html | 8 +-- .../nlp/HasInputAnnotationCols.html | 8 +-- .../nlp/HasMultipleInputAnnotationCols.html | 8 +-- .../nlp/HasOutputAnnotationCol.html | 8 +-- .../nlp/HasOutputAnnotatorType.html | 8 +-- .../com/johnsnowlabs/nlp/HasPretrained.html | 8 +-- .../HasProtectedParams$ProtectedParam.html | 8 +-- .../johnsnowlabs/nlp/HasProtectedParams.html | 8 +-- .../com/johnsnowlabs/nlp/HasRecursiveFit.html | 8 +-- .../nlp/HasRecursiveTransform.html | 8 +-- .../johnsnowlabs/nlp/HasSimpleAnnotate.html | 8 +-- .../api/com/johnsnowlabs/nlp/IAnnotation.html | 8 +-- .../com/johnsnowlabs/nlp/ImageAssembler$.html | 8 +-- .../com/johnsnowlabs/nlp/ImageAssembler.html | 8 +-- .../com/johnsnowlabs/nlp/JavaAnnotation.html | 8 +-- .../com/johnsnowlabs/nlp/LightPipeline.html | 8 +-- .../nlp/MultiDocumentAssembler$.html | 8 +-- .../nlp/MultiDocumentAssembler.html | 8 +-- .../nlp/ParamsAndFeaturesReadable.html | 8 +-- .../nlp/ParamsAndFeaturesWritable.html | 8 +-- .../com/johnsnowlabs/nlp/RawAnnotator.html | 8 +-- .../johnsnowlabs/nlp/RecursivePipeline.html | 8 +-- 
.../nlp/RecursivePipelineModel.html | 8 +-- docs/api/com/johnsnowlabs/nlp/SparkNLP$.html | 8 +-- .../com/johnsnowlabs/nlp/TableAssembler$.html | 8 +-- .../com/johnsnowlabs/nlp/TableAssembler.html | 8 +-- .../com/johnsnowlabs/nlp/TokenAssembler$.html | 8 +-- .../com/johnsnowlabs/nlp/TokenAssembler.html | 8 +-- .../nlp/annotators/Chunk2Doc$.html | 8 +-- .../nlp/annotators/Chunk2Doc.html | 8 +-- .../nlp/annotators/ChunkTokenizer$.html | 8 +-- .../nlp/annotators/ChunkTokenizer.html | 8 +-- .../nlp/annotators/ChunkTokenizerModel$.html | 8 +-- .../nlp/annotators/ChunkTokenizerModel.html | 8 +-- .../johnsnowlabs/nlp/annotators/Chunker$.html | 8 +-- .../johnsnowlabs/nlp/annotators/Chunker.html | 8 +-- .../nlp/annotators/Date2Chunk$.html | 8 +-- .../nlp/annotators/Date2Chunk.html | 8 +-- .../nlp/annotators/DateMatcher$.html | 8 +-- .../nlp/annotators/DateMatcher.html | 8 +-- .../nlp/annotators/DateMatcherTranslator.html | 8 +-- .../DateMatcherTranslatorPolicy.html | 8 +-- .../nlp/annotators/DateMatcherUtils.html | 8 +-- .../nlp/annotators/DocumentNormalizer$.html | 8 +-- .../nlp/annotators/DocumentNormalizer.html | 8 +-- .../nlp/annotators/EnglishStemmer$.html | 8 +-- .../nlp/annotators/GraphExtraction.html | 8 +-- .../nlp/annotators/Lemmatizer$.html | 8 +-- .../nlp/annotators/Lemmatizer.html | 8 +-- .../nlp/annotators/LemmatizerModel$.html | 8 +-- .../nlp/annotators/LemmatizerModel.html | 8 +-- .../nlp/annotators/LookAroundManager$.html | 8 +-- .../nlp/annotators/MultiDateMatcher$.html | 8 +-- .../nlp/annotators/MultiDateMatcher.html | 8 +-- .../nlp/annotators/MultiDatePolicy$.html | 8 +-- .../nlp/annotators/NGramGenerator$.html | 8 +-- .../nlp/annotators/NGramGenerator.html | 8 +-- .../nlp/annotators/Normalizer$.html | 8 +-- .../nlp/annotators/Normalizer.html | 8 +-- .../nlp/annotators/NormalizerModel$.html | 8 +-- ...alizerModel$TokenizerAndNormalizerMap.html | 8 +-- .../nlp/annotators/NormalizerModel.html | 8 +-- .../annotators/PretrainedAnnotations$.html | 8 +-- 
.../ReadablePretrainedLemmatizer.html | 8 +-- ...adablePretrainedStopWordsCleanerModel.html | 8 +-- .../ReadablePretrainedTextMatcher.html | 8 +-- .../ReadablePretrainedTokenizer.html | 8 +-- .../nlp/annotators/RecursiveTokenizer.html | 8 +-- .../annotators/RecursiveTokenizerModel$.html | 8 +-- .../annotators/RecursiveTokenizerModel.html | 8 +-- .../nlp/annotators/RegexMatcher$.html | 8 +-- .../nlp/annotators/RegexMatcher.html | 8 +-- .../nlp/annotators/RegexMatcherModel$.html | 8 +-- .../nlp/annotators/RegexMatcherModel.html | 8 +-- .../nlp/annotators/RegexTokenizer$.html | 8 +-- .../nlp/annotators/RegexTokenizer.html | 8 +-- .../nlp/annotators/SingleDatePolicy$.html | 8 +-- .../johnsnowlabs/nlp/annotators/Stemmer$.html | 8 +-- .../johnsnowlabs/nlp/annotators/Stemmer.html | 8 +-- .../nlp/annotators/StopWordsCleaner$.html | 8 +-- .../nlp/annotators/StopWordsCleaner.html | 8 +-- .../nlp/annotators/TextMatcher$.html | 8 +-- .../nlp/annotators/TextMatcher.html | 8 +-- .../nlp/annotators/TextMatcherModel$.html | 8 +-- .../nlp/annotators/TextMatcherModel.html | 8 +-- .../nlp/annotators/Token2Chunk$.html | 8 +-- .../nlp/annotators/Token2Chunk.html | 8 +-- .../nlp/annotators/Tokenizer$.html | 8 +-- .../nlp/annotators/Tokenizer.html | 8 +-- .../nlp/annotators/TokenizerModel$.html | 8 +-- .../nlp/annotators/TokenizerModel.html | 8 +-- .../nlp/annotators/audio/HubertForCTC$.html | 8 +-- .../nlp/annotators/audio/HubertForCTC.html | 8 +-- .../audio/ReadHubertForAudioDLModel.html | 8 +-- .../audio/ReadWav2Vec2ForAudioDLModel.html | 8 +-- .../audio/ReadWhisperForCTCDLModel.html | 8 +-- ...ReadablePretrainedHubertForAudioModel.html | 8 +-- ...adablePretrainedWav2Vec2ForAudioModel.html | 8 +-- .../ReadablePretrainedWhisperForCTCModel.html | 8 +-- .../nlp/annotators/audio/Wav2Vec2ForCTC$.html | 8 +-- .../nlp/annotators/audio/Wav2Vec2ForCTC.html | 8 +-- .../nlp/annotators/audio/WhisperForCTC$.html | 8 +-- .../nlp/annotators/audio/WhisperForCTC.html | 8 +-- 
.../audio/feature_extractor/AudioUtils$.html | 8 +-- .../PreprocessorAttributes$.html | 8 +-- .../WhisperPreprocessor.html | 8 +-- .../audio/feature_extractor/index.html | 8 +-- .../nlp/annotators/audio/index.html | 8 +-- .../nlp/annotators/btm/BigTextMatcher$.html | 8 +-- .../nlp/annotators/btm/BigTextMatcher.html | 8 +-- .../annotators/btm/BigTextMatcherModel$.html | 8 +-- .../annotators/btm/BigTextMatcherModel.html | 8 +-- .../btm/ReadablePretrainedBigTextMatcher.html | 8 +-- .../nlp/annotators/btm/TMEdgesReadWriter.html | 8 +-- .../nlp/annotators/btm/TMEdgesReader.html | 8 +-- .../nlp/annotators/btm/TMNodesReader.html | 8 +-- .../nlp/annotators/btm/TMNodesWriter.html | 8 +-- .../nlp/annotators/btm/TMVocabReadWriter.html | 8 +-- .../nlp/annotators/btm/TMVocabReader.html | 8 +-- .../nlp/annotators/btm/TrieNode.html | 8 +-- .../nlp/annotators/btm/index.html | 8 +-- .../dl/AlbertForQuestionAnswering$.html | 62 ++++++++++++++-- .../dl/AlbertForQuestionAnswering.html | 54 +++++++++++--- .../dl/AlbertForSequenceClassification$.html | 62 ++++++++++++++-- .../dl/AlbertForSequenceClassification.html | 54 +++++++++++--- .../dl/AlbertForTokenClassification$.html | 62 ++++++++++++++-- .../dl/AlbertForTokenClassification.html | 54 +++++++++++--- .../dl/BartForZeroShotClassification$.html | 8 +-- .../dl/BartForZeroShotClassification.html | 8 +-- .../dl/BertForQuestionAnswering$.html | 8 +-- .../dl/BertForQuestionAnswering.html | 8 +-- .../dl/BertForSequenceClassification$.html | 8 +-- .../dl/BertForSequenceClassification.html | 8 +-- .../dl/BertForTokenClassification$.html | 8 +-- .../dl/BertForTokenClassification.html | 8 +-- .../dl/BertForZeroShotClassification$.html | 8 +-- .../dl/BertForZeroShotClassification.html | 8 +-- .../dl/CamemBertForQuestionAnswering$.html | 8 +-- .../dl/CamemBertForQuestionAnswering.html | 8 +-- .../CamemBertForSequenceClassification$.html | 8 +-- .../CamemBertForSequenceClassification.html | 8 +-- .../dl/CamemBertForTokenClassification$.html | 8 
+-- .../dl/CamemBertForTokenClassification.html | 8 +-- .../classifier/dl/ClassifierDLApproach$.html | 8 +-- .../classifier/dl/ClassifierDLApproach.html | 8 +-- .../classifier/dl/ClassifierDLModel$.html | 8 +-- .../classifier/dl/ClassifierDLModel.html | 8 +-- .../classifier/dl/ClassifierEncoder.html | 8 +-- .../classifier/dl/ClassifierMetrics.html | 8 +-- .../dl/DeBertaForQuestionAnswering$.html | 8 +-- .../dl/DeBertaForQuestionAnswering.html | 8 +-- .../dl/DeBertaForSequenceClassification$.html | 8 +-- .../dl/DeBertaForSequenceClassification.html | 8 +-- .../dl/DeBertaForTokenClassification$.html | 8 +-- .../dl/DeBertaForTokenClassification.html | 8 +-- .../dl/DistilBertForQuestionAnswering$.html | 8 +-- .../dl/DistilBertForQuestionAnswering.html | 8 +-- .../DistilBertForSequenceClassification$.html | 8 +-- .../DistilBertForSequenceClassification.html | 8 +-- .../dl/DistilBertForTokenClassification$.html | 8 +-- .../dl/DistilBertForTokenClassification.html | 8 +-- .../DistilBertForZeroShotClassification$.html | 8 +-- .../DistilBertForZeroShotClassification.html | 8 +-- .../dl/LongformerForQuestionAnswering$.html | 8 +-- .../dl/LongformerForQuestionAnswering.html | 8 +-- .../LongformerForSequenceClassification$.html | 8 +-- .../LongformerForSequenceClassification.html | 8 +-- .../dl/LongformerForTokenClassification$.html | 8 +-- .../dl/LongformerForTokenClassification.html | 8 +-- .../dl/MultiClassifierDLApproach.html | 8 +-- .../dl/MultiClassifierDLModel$.html | 8 +-- .../classifier/dl/MultiClassifierDLModel.html | 8 +-- ...ReadAlbertForQuestionAnsweringDLModel.html | 64 +++++++++++++++-- .../dl/ReadAlbertForSequenceDLModel.html | 64 +++++++++++++++-- .../dl/ReadAlbertForTokenDLModel.html | 64 +++++++++++++++-- .../dl/ReadBartForZeroShotDLModel.html | 8 +-- .../ReadBertForQuestionAnsweringDLModel.html | 8 +-- .../dl/ReadBertForSequenceDLModel.html | 8 +-- .../dl/ReadBertForTokenDLModel.html | 8 +-- .../dl/ReadBertForZeroShotDLModel.html | 8 +-- 
.../dl/ReadCamemBertForQADLModel.html | 8 +-- .../dl/ReadCamemBertForSequenceDLModel.html | 8 +-- .../dl/ReadCamemBertForTokenDLModel.html | 8 +-- .../dl/ReadClassifierDLTensorflowModel.html | 8 +-- ...eadDeBertaForQuestionAnsweringDLModel.html | 8 +-- .../dl/ReadDeBertaForSequenceDLModel.html | 8 +-- .../dl/ReadDeBertaForTokenDLModel.html | 8 +-- ...DistilBertForQuestionAnsweringDLModel.html | 8 +-- .../dl/ReadDistilBertForSequenceDLModel.html | 8 +-- .../dl/ReadDistilBertForTokenDLModel.html | 8 +-- .../dl/ReadDistilBertForZeroShotDLModel.html | 8 +-- ...LongformerForQuestionAnsweringDLModel.html | 8 +-- .../dl/ReadLongformerForSequenceDLModel.html | 8 +-- .../dl/ReadLongformerForTokenDLModel.html | 8 +-- .../ReadMultiClassifierDLTensorflowModel.html | 8 +-- ...eadRoBertaForQuestionAnsweringDLModel.html | 8 +-- .../dl/ReadRoBertaForSequenceDLModel.html | 8 +-- .../dl/ReadRoBertaForTokenDLModel.html | 8 +-- .../dl/ReadRoBertaForZeroShotDLModel.html | 8 +-- .../dl/ReadSentimentDLTensorflowModel.html | 8 +-- .../ReadTapasForQuestionAnsweringDLModel.html | 8 +-- ...XlmRoBertaForQuestionAnsweringDLModel.html | 8 +-- .../dl/ReadXlmRoBertaForSequenceDLModel.html | 8 +-- .../dl/ReadXlmRoBertaForTokenDLModel.html | 8 +-- .../dl/ReadXlmRoBertaForZeroShotDLModel.html | 8 +-- .../dl/ReadXlnetForSequenceDLModel.html | 8 +-- .../dl/ReadXlnetForTokenDLModel.html | 8 +-- .../ReadablePretrainedAlbertForQAModel.html | 8 +-- ...dablePretrainedAlbertForSequenceModel.html | 8 +-- ...ReadablePretrainedAlbertForTokenModel.html | 8 +-- ...eadablePretrainedBartForZeroShotModel.html | 8 +-- .../dl/ReadablePretrainedBertForQAModel.html | 8 +-- ...eadablePretrainedBertForSequenceModel.html | 8 +-- .../ReadablePretrainedBertForTokenModel.html | 8 +-- ...eadablePretrainedBertForZeroShotModel.html | 8 +-- ...ReadablePretrainedCamemBertForQAModel.html | 8 +-- ...lePretrainedCamemBertForSequenceModel.html | 8 +-- ...dablePretrainedCamemBertForTokenModel.html | 8 +-- 
.../dl/ReadablePretrainedClassifierDL.html | 8 +-- .../ReadablePretrainedDeBertaForQAModel.html | 8 +-- ...ablePretrainedDeBertaForSequenceModel.html | 8 +-- ...eadablePretrainedDeBertaForTokenModel.html | 8 +-- ...eadablePretrainedDistilBertForQAModel.html | 8 +-- ...ePretrainedDistilBertForSequenceModel.html | 8 +-- ...ablePretrainedDistilBertForTokenModel.html | 8 +-- ...ePretrainedDistilBertForZeroShotModel.html | 8 +-- ...eadablePretrainedLongformerForQAModel.html | 8 +-- ...ePretrainedLongformerForSequenceModel.html | 8 +-- ...ablePretrainedLongformerForTokenModel.html | 8 +-- .../ReadablePretrainedMultiClassifierDL.html | 8 +-- .../ReadablePretrainedRoBertaForQAModel.html | 8 +-- ...ablePretrainedRoBertaForSequenceModel.html | 8 +-- ...eadablePretrainedRoBertaForTokenModel.html | 8 +-- ...ablePretrainedRoBertaForZeroShotModel.html | 8 +-- .../dl/ReadablePretrainedSentimentDL.html | 8 +-- .../dl/ReadablePretrainedTapasForQAModel.html | 8 +-- ...eadablePretrainedXlmRoBertaForQAModel.html | 8 +-- ...ePretrainedXlmRoBertaForSequenceModel.html | 8 +-- ...ablePretrainedXlmRoBertaForTokenModel.html | 8 +-- ...ePretrainedXlmRoBertaForZeroShotModel.html | 8 +-- ...adablePretrainedXlnetForSequenceModel.html | 8 +-- .../ReadablePretrainedXlnetForTokenModel.html | 8 +-- .../dl/RoBertaForQuestionAnswering$.html | 8 +-- .../dl/RoBertaForQuestionAnswering.html | 8 +-- .../dl/RoBertaForSequenceClassification$.html | 8 +-- .../dl/RoBertaForSequenceClassification.html | 8 +-- .../dl/RoBertaForTokenClassification$.html | 8 +-- .../dl/RoBertaForTokenClassification.html | 8 +-- .../dl/RoBertaForZeroShotClassification$.html | 8 +-- .../dl/RoBertaForZeroShotClassification.html | 8 +-- .../classifier/dl/SentimentApproach$.html | 8 +-- .../classifier/dl/SentimentDLApproach.html | 8 +-- .../classifier/dl/SentimentDLModel$.html | 8 +-- .../classifier/dl/SentimentDLModel.html | 8 +-- .../dl/TapasForQuestionAnswering$.html | 8 +-- .../dl/TapasForQuestionAnswering.html | 8 +-- 
.../dl/XlmRoBertaForQuestionAnswering$.html | 8 +-- .../dl/XlmRoBertaForQuestionAnswering.html | 8 +-- .../XlmRoBertaForSequenceClassification$.html | 8 +-- .../XlmRoBertaForSequenceClassification.html | 8 +-- .../dl/XlmRoBertaForTokenClassification$.html | 8 +-- .../dl/XlmRoBertaForTokenClassification.html | 8 +-- .../XlmRoBertaForZeroShotClassification$.html | 8 +-- .../XlmRoBertaForZeroShotClassification.html | 8 +-- .../dl/XlnetForSequenceClassification$.html | 8 +-- .../dl/XlnetForSequenceClassification.html | 8 +-- .../dl/XlnetForTokenClassification$.html | 8 +-- .../dl/XlnetForTokenClassification.html | 8 +-- .../nlp/annotators/classifier/dl/index.html | 32 ++++----- .../nlp/annotators/classifier/index.html | 8 +-- .../nlp/annotators/common/Annotated$.html | 8 +-- .../nlp/annotators/common/Annotated.html | 8 +-- .../nlp/annotators/common/ChunkSplit$.html | 8 +-- .../nlp/annotators/common/ConllSentence.html | 8 +-- .../DatasetHelpers$$DataFrameHelper.html | 8 +-- .../annotators/common/DatasetHelpers$.html | 8 +-- .../annotators/common/DependencyParsed$.html | 8 +-- .../common/DependencyParsedSentence.html | 8 +-- .../common/EmbeddingsWithSentence$.html | 8 +-- .../annotators/common/IndexedTaggedWord.html | 8 +-- .../nlp/annotators/common/IndexedToken.html | 8 +-- .../nlp/annotators/common/InfixToken$.html | 8 +-- .../nlp/annotators/common/InfixToken.html | 8 +-- .../LabeledDependency$$DependencyInfo.html | 8 +-- .../annotators/common/LabeledDependency$.html | 8 +-- .../nlp/annotators/common/NerTagged$.html | 8 +-- .../nlp/annotators/common/PosTagged$.html | 8 +-- .../nlp/annotators/common/PrefixedToken$.html | 8 +-- .../nlp/annotators/common/PrefixedToken.html | 8 +-- .../common/PreprocessingParser.html | 8 +-- .../nlp/annotators/common/Sentence$.html | 8 +-- .../nlp/annotators/common/Sentence.html | 8 +-- .../nlp/annotators/common/SentenceSplit$.html | 8 +-- .../nlp/annotators/common/SuffixedToken$.html | 8 +-- .../nlp/annotators/common/SuffixedToken.html | 
8 +-- .../nlp/annotators/common/TableData$.html | 8 +-- .../nlp/annotators/common/TableData.html | 8 +-- .../nlp/annotators/common/Tagged.html | 8 +-- .../annotators/common/TaggedSentence$.html | 8 +-- .../nlp/annotators/common/TaggedSentence.html | 8 +-- .../nlp/annotators/common/TaggedWord.html | 8 +-- .../nlp/annotators/common/TokenPiece.html | 8 +-- .../common/TokenPieceEmbeddings$.html | 8 +-- .../common/TokenPieceEmbeddings.html | 8 +-- .../annotators/common/TokenizedSentence.html | 8 +-- .../common/TokenizedWithSentence$.html | 8 +-- .../annotators/common/WordWithDependency.html | 8 +-- .../common/WordpieceEmbeddingsSentence$.html | 8 +-- .../common/WordpieceEmbeddingsSentence.html | 8 +-- .../common/WordpieceTokenized$.html | 8 +-- .../common/WordpieceTokenizedSentence.html | 8 +-- .../nlp/annotators/common/index.html | 8 +-- .../ReadSpanBertCorefTensorflowModel.html | 8 +-- .../ReadablePretrainedSpanBertCorefModel.html | 8 +-- .../annotators/coref/SpanBertCorefModel$.html | 8 +-- .../annotators/coref/SpanBertCorefModel.html | 8 +-- .../nlp/annotators/coref/index.html | 8 +-- .../cv/ConvNextForImageClassification$.html | 8 +-- .../cv/ConvNextForImageClassification.html | 8 +-- .../cv/ReadConvNextForImageDLModel.html | 8 +-- .../cv/ReadSwinForImageDLModel.html | 8 +-- .../annotators/cv/ReadViTForImageDLModel.html | 8 +-- ...adablePretrainedConvNextForImageModel.html | 8 +-- .../ReadablePretrainedSwinForImageModel.html | 8 +-- .../ReadablePretrainedViTForImageModel.html | 8 +-- .../cv/SwinForImageClassification$.html | 8 +-- .../cv/SwinForImageClassification.html | 8 +-- .../cv/ViTForImageClassification$.html | 8 +-- .../cv/ViTForImageClassification.html | 8 +-- .../johnsnowlabs/nlp/annotators/cv/index.html | 8 +-- .../er/AhoCorasickAutomaton$Node.html | 8 +-- .../annotators/er/AhoCorasickAutomaton.html | 8 +-- .../nlp/annotators/er/EntityPattern.html | 8 +-- .../annotators/er/EntityRulerApproach.html | 8 +-- .../annotators/er/EntityRulerFeatures.html | 8 +-- 
.../nlp/annotators/er/EntityRulerModel$.html | 8 +-- .../nlp/annotators/er/EntityRulerModel.html | 8 +-- .../nlp/annotators/er/EntityRulerUtil$.html | 8 +-- .../annotators/er/FlattenEntityPattern.html | 8 +-- .../nlp/annotators/er/PatternsReadWriter.html | 8 +-- .../nlp/annotators/er/PatternsReader.html | 8 +-- .../er/ReadablePretrainedEntityRuler.html | 8 +-- .../er/RegexPatternsReadWriter.html | 8 +-- .../annotators/er/RegexPatternsReader.html | 8 +-- .../johnsnowlabs/nlp/annotators/er/index.html | 8 +-- .../johnsnowlabs/nlp/annotators/index.html | 8 +-- .../nlp/annotators/keyword/index.html | 8 +-- .../keyword/yake/YakeKeywordExtraction$.html | 8 +-- .../keyword/yake/YakeKeywordExtraction.html | 8 +-- .../annotators/keyword/yake/YakeParams.html | 8 +-- .../nlp/annotators/keyword/yake/index.html | 8 +-- .../annotators/keyword/yake/util/Token.html | 8 +-- .../keyword/yake/util/Utilities$.html | 8 +-- .../annotators/keyword/yake/util/index.html | 8 +-- .../annotators/ld/dl/LanguageDetectorDL$.html | 8 +-- .../annotators/ld/dl/LanguageDetectorDL.html | 8 +-- ...ReadLanguageDetectorDLTensorflowModel.html | 8 +-- ...ablePretrainedLanguageDetectorDLModel.html | 8 +-- .../nlp/annotators/ld/dl/index.html | 8 +-- .../johnsnowlabs/nlp/annotators/ld/index.html | 8 +-- .../nlp/annotators/ner/ModelMetrics$.html | 8 +-- .../nlp/annotators/ner/NamedEntity.html | 8 +-- .../nlp/annotators/ner/NerApproach.html | 8 +-- .../nlp/annotators/ner/NerConverter$.html | 8 +-- .../nlp/annotators/ner/NerConverter.html | 8 +-- .../nlp/annotators/ner/NerOverwriter$.html | 8 +-- .../nlp/annotators/ner/NerOverwriter.html | 8 +-- .../nlp/annotators/ner/NerTagsEncoding$.html | 8 +-- .../nlp/annotators/ner/Verbose$.html | 8 +-- .../ner/crf/DictionaryFeatures$.html | 8 +-- .../ner/crf/DictionaryFeatures.html | 8 +-- .../ner/crf/FeatureGenerator$TokenType$.html | 8 +-- .../annotators/ner/crf/FeatureGenerator.html | 8 +-- .../annotators/ner/crf/NerCrfApproach$.html | 8 +-- 
.../annotators/ner/crf/NerCrfApproach.html | 8 +-- .../nlp/annotators/ner/crf/NerCrfModel$.html | 8 +-- .../nlp/annotators/ner/crf/NerCrfModel.html | 8 +-- .../ner/crf/ReadablePretrainedNerCrf.html | 8 +-- .../nlp/annotators/ner/crf/index.html | 8 +-- .../nlp/annotators/ner/dl/LoadsContrib$.html | 8 +-- .../nlp/annotators/ner/dl/NerDLApproach$.html | 8 +-- .../nlp/annotators/ner/dl/NerDLApproach.html | 8 +-- .../nlp/annotators/ner/dl/NerDLModel$.html | 8 +-- .../nlp/annotators/ner/dl/NerDLModel.html | 8 +-- .../ner/dl/NerDLModelPythonReader$.html | 8 +-- .../ner/dl/ReadZeroShotNerDLModel.html | 8 +-- .../ner/dl/ReadablePretrainedNerDL.html | 8 +-- .../ner/dl/ReadablePretrainedZeroShotNer.html | 8 +-- .../nlp/annotators/ner/dl/ReadsNERGraph.html | 8 +-- .../annotators/ner/dl/WithGraphResolver.html | 8 +-- .../annotators/ner/dl/ZeroShotNerModel$.html | 8 +-- .../annotators/ner/dl/ZeroShotNerModel.html | 8 +-- .../nlp/annotators/ner/dl/index.html | 8 +-- .../nlp/annotators/ner/index.html | 8 +-- ...lizableFormat$$SerializableDateFormat.html | 8 +-- .../AnnotatorParam$SerializableFormat$.html | 8 +-- .../nlp/annotators/param/AnnotatorParam.html | 8 +-- .../annotators/param/EvaluationDLParams.html | 8 +-- .../param/ExternalResourceParam.html | 8 +-- .../param/SerializedAnnotatorComponent.html | 8 +-- .../param/WritableAnnotatorComponent.html | 8 +-- .../nlp/annotators/param/index.html | 8 +-- .../parser/dep/DependencyParserApproach$.html | 8 +-- .../parser/dep/DependencyParserApproach.html | 8 +-- .../parser/dep/DependencyParserModel$.html | 8 +-- .../parser/dep/DependencyParserModel.html | 8 +-- .../GreedyTransition/DependencyMaker$.html | 8 +-- .../DependencyMaker$CurrentState.html | 8 +-- .../DependencyMaker$ParseState.html | 8 +-- .../dep/GreedyTransition/DependencyMaker.html | 8 +-- .../GreedyTransitionApproach$.html | 8 +-- .../parser/dep/GreedyTransition/index.html | 8 +-- .../GreedyTransition/package$$Feature.html | 8 +-- 
.../GreedyTransition/package$$WordData.html | 8 +-- .../parser/dep/Perceptron$WeightLearner.html | 8 +-- .../nlp/annotators/parser/dep/Perceptron.html | 8 +-- .../dep/ReadablePretrainedDependency.html | 8 +-- .../annotators/parser/dep/TagDictionary$.html | 8 +-- .../nlp/annotators/parser/dep/Tagger$.html | 8 +-- .../nlp/annotators/parser/dep/Tagger.html | 8 +-- .../nlp/annotators/parser/dep/index.html | 8 +-- .../nlp/annotators/parser/index.html | 8 +-- .../annotators/parser/typdep/ConllData.html | 8 +-- .../parser/typdep/DependencyArcList.html | 8 +-- .../parser/typdep/DependencyInstance.html | 8 +-- .../parser/typdep/DependencyPipe.html | 8 +-- .../parser/typdep/LocalFeatureData.html | 8 +-- .../parser/typdep/LowRankTensor.html | 8 +-- .../nlp/annotators/parser/typdep/Options.html | 8 +-- .../annotators/parser/typdep/Parameters.html | 8 +-- .../parser/typdep/PredictionParameters.html | 8 +-- .../ReadablePretrainedTypedDependency.html | 8 +-- .../parser/typdep/TrainDependencies.html | 8 +-- .../annotators/parser/typdep/TrainFile.html | 8 +-- .../parser/typdep/TypedDependencyParser.html | 8 +-- .../TypedDependencyParserApproach$.html | 8 +-- .../typdep/TypedDependencyParserApproach.html | 8 +-- .../typdep/TypedDependencyParserModel$.html | 8 +-- .../typdep/TypedDependencyParserModel.html | 8 +-- .../typdep/feature/FeatureTemplate.html | 8 +-- .../feature/SyntacticFeatureFactory.html | 8 +-- .../parser/typdep/feature/index.html | 8 +-- .../nlp/annotators/parser/typdep/index.html | 8 +-- .../parser/typdep/io/Conll09Reader.html | 8 +-- .../parser/typdep/io/ConllUReader.html | 8 +-- .../parser/typdep/io/ConllWriter.html | 8 +-- .../parser/typdep/io/DependencyReader.html | 8 +-- .../annotators/parser/typdep/io/index.html | 8 +-- .../parser/typdep/util/Alphabet.html | 8 +-- .../parser/typdep/util/Collector.html | 8 +-- .../parser/typdep/util/DependencyLabel.html | 8 +-- .../parser/typdep/util/Dictionary.html | 8 +-- .../parser/typdep/util/DictionarySet.html | 8 +-- 
.../parser/typdep/util/FeatureVector.html | 8 +-- .../parser/typdep/util/ScoreCollector.html | 8 +-- .../annotators/parser/typdep/util/Utils.html | 8 +-- .../annotators/parser/typdep/util/index.html | 8 +-- .../nlp/annotators/pos/index.html | 8 +-- .../pos/perceptron/AveragedPerceptron.html | 8 +-- .../pos/perceptron/PerceptronApproach$.html | 8 +-- .../pos/perceptron/PerceptronApproach.html | 8 +-- .../PerceptronApproachDistributed$.html | 8 +-- .../PerceptronApproachDistributed.html | 8 +-- .../pos/perceptron/PerceptronModel$.html | 8 +-- .../pos/perceptron/PerceptronModel.html | 8 +-- .../perceptron/PerceptronPredictionUtils.html | 8 +-- .../perceptron/PerceptronTrainingUtils.html | 8 +-- .../pos/perceptron/PerceptronUtils.html | 8 +-- .../ReadablePretrainedPerceptron.html | 8 +-- .../StringMapStringDoubleAccumulator.html | 8 +-- .../perceptron/TrainingPerceptronLegacy.html | 8 +-- .../TupleKeyLongDoubleMapAccumulator.html | 8 +-- .../nlp/annotators/pos/perceptron/index.html | 8 +-- .../sbd/SentenceDetectorParams.html | 8 +-- .../nlp/annotators/sbd/index.html | 8 +-- .../sbd/pragmatic/CustomPragmaticMethod.html | 8 +-- .../sbd/pragmatic/DefaultPragmaticMethod.html | 8 +-- .../sbd/pragmatic/MixedPragmaticMethod.html | 8 +-- .../pragmatic/PragmaticContentFormatter$.html | 8 +-- .../pragmatic/PragmaticContentFormatter.html | 8 +-- .../sbd/pragmatic/PragmaticDictionaries$.html | 8 +-- .../sbd/pragmatic/PragmaticMethod.html | 8 +-- .../pragmatic/PragmaticSentenceExtractor.html | 8 +-- .../sbd/pragmatic/PragmaticSymbols$.html | 8 +-- .../annotators/sbd/pragmatic/RuleSymbols.html | 8 +-- .../sbd/pragmatic/SentenceDetector$.html | 8 +-- .../sbd/pragmatic/SentenceDetector.html | 8 +-- .../nlp/annotators/sbd/pragmatic/index.html | 8 +-- .../nlp/annotators/sda/index.html | 8 +-- .../sda/pragmatic/PragmaticScorer.html | 8 +-- .../sda/pragmatic/SentimentDetector$.html | 8 +-- .../sda/pragmatic/SentimentDetector.html | 8 +-- .../pragmatic/SentimentDetectorModel$.html | 8 +-- 
.../sda/pragmatic/SentimentDetectorModel.html | 8 +-- .../nlp/annotators/sda/pragmatic/index.html | 8 +-- .../sda/vivekn/ReadablePretrainedVivekn.html | 8 +-- .../sda/vivekn/ViveknSentimentApproach.html | 8 +-- .../sda/vivekn/ViveknSentimentModel$.html | 8 +-- .../sda/vivekn/ViveknSentimentModel.html | 8 +-- .../sda/vivekn/ViveknSentimentUtils.html | 8 +-- .../nlp/annotators/sda/vivekn/index.html | 8 +-- .../sentence_detector_dl/Metrics.html | 8 +-- .../ReadablePretrainedSentenceDetectorDL.html | 8 +-- .../ReadsSentenceDetectorDLGraph.html | 8 +-- .../SentenceDetectorDLApproach.html | 8 +-- .../SentenceDetectorDLEncoder$.html | 8 +-- .../SentenceDetectorDLEncoder.html | 8 +-- .../SentenceDetectorDLEncoderParam.html | 8 +-- .../SentenceDetectorDLModel$.html | 8 +-- .../SentenceDetectorDLModel.html | 8 +-- .../sentence_detector_dl/index.html | 8 +-- .../annotators/seq2seq/BartTransformer$.html | 8 +-- .../annotators/seq2seq/BartTransformer.html | 8 +-- .../annotators/seq2seq/GPT2Transformer$.html | 8 +-- .../annotators/seq2seq/GPT2Transformer.html | 8 +-- .../seq2seq/MarianTransformer$.html | 8 +-- .../annotators/seq2seq/MarianTransformer.html | 8 +-- .../seq2seq/ReadBartTransformerDLModel.html | 8 +-- .../seq2seq/ReadGPT2TransformerDLModel.html | 8 +-- .../seq2seq/ReadMarianMTDLModel.html | 8 +-- .../seq2seq/ReadT5TransformerDLModel.html | 8 +-- ...eadablePretrainedBartTransformerModel.html | 8 +-- ...eadablePretrainedGPT2TransformerModel.html | 8 +-- .../ReadablePretrainedMarianMTModel.html | 8 +-- .../ReadablePretrainedT5TransformerModel.html | 8 +-- .../annotators/seq2seq/T5Transformer$.html | 8 +-- .../nlp/annotators/seq2seq/T5Transformer.html | 8 +-- .../nlp/annotators/seq2seq/index.html | 8 +-- .../DocumentSimilarityRankerApproach$.html | 8 +-- .../DocumentSimilarityRankerApproach.html | 8 +-- .../DocumentSimilarityRankerModel$.html | 8 +-- .../DocumentSimilarityRankerModel.html | 8 +-- .../similarity/IndexedNeighbors.html | 8 +-- 
.../IndexedNeighborsWithDistance.html | 8 +-- .../similarity/NeighborAnnotation.html | 8 +-- .../similarity/NeighborsResultSet.html | 8 +-- .../ReadableDocumentSimilarityRanker.html | 8 +-- .../nlp/annotators/similarity/index.html | 8 +-- .../spell/context/CandidateStrategy$.html | 8 +-- ...ntextSpellCheckerApproach$ArrayHelper.html | 8 +-- .../context/ContextSpellCheckerApproach.html | 8 +-- .../context/ContextSpellCheckerModel$.html | 8 +-- .../ContextSpellCheckerModel$StringTools.html | 8 +-- .../context/ContextSpellCheckerModel.html | 8 +-- .../spell/context/HasTransducerFeatures.html | 8 +-- .../spell/context/LangModelSentence.html | 8 +-- .../ReadablePretrainedContextSpell.html | 8 +-- .../context/ReadsLanguageModelGraph.html | 8 +-- .../spell/context/WeightedLevenshtein.html | 8 +-- .../nlp/annotators/spell/context/index.html | 8 +-- .../spell/context/parser/AgeToken.html | 8 +-- .../spell/context/parser/DateToken.html | 8 +-- .../context/parser/GenericRegexParser.html | 8 +-- .../context/parser/GenericVocabParser.html | 8 +-- .../spell/context/parser/LocationClass.html | 8 +-- .../spell/context/parser/MainVocab.html | 8 +-- .../spell/context/parser/MedicationClass.html | 8 +-- .../spell/context/parser/NamesClass.html | 8 +-- .../spell/context/parser/NumberToken.html | 8 +-- .../spell/context/parser/RegexParser.html | 8 +-- .../context/parser/SerializableClass.html | 8 +-- .../context/parser/SpecialClassParser.html | 8 +-- .../context/parser/TransducerSeqFeature.html | 8 +-- .../spell/context/parser/UnitToken.html | 8 +-- .../spell/context/parser/VocabParser.html | 8 +-- .../spell/context/parser/index.html | 8 +-- .../nlp/annotators/spell/index.html | 8 +-- .../spell/norvig/NorvigSweetingApproach$.html | 8 +-- .../spell/norvig/NorvigSweetingApproach.html | 8 +-- .../spell/norvig/NorvigSweetingModel$.html | 8 +-- .../spell/norvig/NorvigSweetingModel.html | 8 +-- .../spell/norvig/NorvigSweetingParams.html | 8 +-- .../norvig/ReadablePretrainedNorvig.html | 8 
+-- .../nlp/annotators/spell/norvig/index.html | 8 +-- .../ReadablePretrainedSymmetric.html | 8 +-- .../symmetric/SymmetricDeleteApproach$.html | 8 +-- .../symmetric/SymmetricDeleteApproach.html | 8 +-- .../symmetric/SymmetricDeleteModel$.html | 8 +-- .../SymmetricDeleteModel$SuggestedWord.html | 8 +-- .../spell/symmetric/SymmetricDeleteModel.html | 8 +-- .../symmetric/SymmetricDeleteParams.html | 8 +-- .../nlp/annotators/spell/symmetric/index.html | 8 +-- .../nlp/annotators/spell/util/Utilities$.html | 8 +-- .../nlp/annotators/spell/util/index.html | 8 +-- .../nlp/annotators/tapas/TapasCellDate$.html | 8 +-- .../nlp/annotators/tapas/TapasCellDate.html | 8 +-- .../nlp/annotators/tapas/TapasCellValue$.html | 8 +-- .../nlp/annotators/tapas/TapasCellValue.html | 8 +-- .../nlp/annotators/tapas/TapasEncoder.html | 8 +-- .../nlp/annotators/tapas/TapasInputData.html | 8 +-- .../tapas/TapasNumericRelation$.html | 8 +-- .../tapas/TapasNumericValueSpan$.html | 8 +-- .../tapas/TapasNumericValueSpan.html | 8 +-- .../nlp/annotators/tapas/index.html | 8 +-- .../tokenizer/bpe/BartTokenizer.html | 8 +-- .../tokenizer/bpe/BpeTokenizer$.html | 8 +-- .../tokenizer/bpe/Gpt2Tokenizer.html | 8 +-- .../tokenizer/bpe/RobertaTokenizer.html | 8 +-- .../tokenizer/bpe/SpecialToken.html | 8 +-- .../tokenizer/bpe/WhisperTokenDecoder.html | 8 +-- .../nlp/annotators/tokenizer/bpe/index.html | 8 +-- .../nlp/annotators/tokenizer/index.html | 8 +-- .../ws/ReadablePretrainedWordSegmenter.html | 8 +-- .../nlp/annotators/ws/TagsType$.html | 8 +-- .../annotators/ws/WordSegmenterApproach$.html | 8 +-- .../annotators/ws/WordSegmenterApproach.html | 8 +-- .../annotators/ws/WordSegmenterModel$.html | 8 +-- .../nlp/annotators/ws/WordSegmenterModel.html | 8 +-- .../johnsnowlabs/nlp/annotators/ws/index.html | 8 +-- .../nlp/embeddings/AlbertEmbeddings$.html | 8 +-- .../nlp/embeddings/AlbertEmbeddings.html | 8 +-- .../nlp/embeddings/BertEmbeddings$.html | 8 +-- .../nlp/embeddings/BertEmbeddings.html | 8 +-- 
.../embeddings/BertSentenceEmbeddings$.html | 8 +-- .../embeddings/BertSentenceEmbeddings.html | 8 +-- .../nlp/embeddings/CamemBertEmbeddings$.html | 8 +-- .../nlp/embeddings/CamemBertEmbeddings.html | 8 +-- .../nlp/embeddings/ChunkEmbeddings$.html | 8 +-- .../nlp/embeddings/ChunkEmbeddings.html | 8 +-- .../nlp/embeddings/DeBertaEmbeddings$.html | 8 +-- .../nlp/embeddings/DeBertaEmbeddings.html | 8 +-- .../nlp/embeddings/DistilBertEmbeddings$.html | 8 +-- .../nlp/embeddings/DistilBertEmbeddings.html | 8 +-- .../nlp/embeddings/Doc2VecApproach$.html | 8 +-- .../nlp/embeddings/Doc2VecApproach.html | 8 +-- .../nlp/embeddings/Doc2VecModel$.html | 8 +-- .../nlp/embeddings/Doc2VecModel.html | 28 ++++++-- .../nlp/embeddings/E5Embeddings$.html | 8 +-- .../nlp/embeddings/E5Embeddings.html | 8 +-- .../nlp/embeddings/ElmoEmbeddings$.html | 8 +-- .../nlp/embeddings/ElmoEmbeddings.html | 8 +-- .../EmbeddingsCoverage$CoverageResult.html | 8 +-- .../nlp/embeddings/EmbeddingsCoverage.html | 8 +-- .../embeddings/HasEmbeddingsProperties.html | 8 +-- .../nlp/embeddings/InstructorEmbeddings$.html | 8 +-- .../nlp/embeddings/InstructorEmbeddings.html | 8 +-- .../nlp/embeddings/LongformerEmbeddings$.html | 8 +-- .../nlp/embeddings/LongformerEmbeddings.html | 8 +-- .../nlp/embeddings/MPNetEmbeddings$.html | 62 ++++++++++++++-- .../nlp/embeddings/MPNetEmbeddings.html | 54 +++++++++++--- .../PoolingStrategy$$AnnotatorType$.html | 8 +-- .../nlp/embeddings/PoolingStrategy$.html | 8 +-- .../nlp/embeddings/ReadAlbertDLModel.html | 8 +-- .../nlp/embeddings/ReadBertDLModel.html | 8 +-- .../embeddings/ReadBertSentenceDLModel.html | 8 +-- .../nlp/embeddings/ReadCamemBertDLModel.html | 8 +-- .../nlp/embeddings/ReadDeBertaDLModel.html | 8 +-- .../nlp/embeddings/ReadDistilBertDLModel.html | 8 +-- .../nlp/embeddings/ReadE5DLModel.html | 8 +-- .../nlp/embeddings/ReadElmoDLModel.html | 8 +-- .../nlp/embeddings/ReadInstructorDLModel.html | 8 +-- .../nlp/embeddings/ReadLongformerDLModel.html | 8 +-- 
.../nlp/embeddings/ReadMPNetDLModel.html | 66 +++++++++++++++--- .../nlp/embeddings/ReadRobertaDLModel.html | 8 +-- .../ReadRobertaSentenceDLModel.html | 8 +-- .../nlp/embeddings/ReadUSEDLModel.html | 8 +-- .../nlp/embeddings/ReadXlmRobertaDLModel.html | 8 +-- .../ReadXlmRobertaSentenceDLModel.html | 8 +-- .../nlp/embeddings/ReadXlnetDLModel.html | 8 +-- .../ReadablePretrainedAlbertModel.html | 8 +-- .../ReadablePretrainedBertModel.html | 8 +-- .../ReadablePretrainedBertSentenceModel.html | 8 +-- .../ReadablePretrainedCamemBertModel.html | 8 +-- .../ReadablePretrainedDeBertaModel.html | 8 +-- .../ReadablePretrainedDistilBertModel.html | 8 +-- .../embeddings/ReadablePretrainedDoc2Vec.html | 8 +-- .../embeddings/ReadablePretrainedE5Model.html | 8 +-- .../ReadablePretrainedElmoModel.html | 8 +-- .../ReadablePretrainedInstructorModel.html | 8 +-- .../ReadablePretrainedLongformerModel.html | 8 +-- .../ReadablePretrainedMPNetModel.html | 8 +-- .../ReadablePretrainedRobertaModel.html | 8 +-- ...eadablePretrainedRobertaSentenceModel.html | 8 +-- .../ReadablePretrainedUSEModel.html | 8 +-- .../ReadablePretrainedWord2Vec.html | 8 +-- .../ReadablePretrainedWordEmbeddings.html | 8 +-- .../ReadablePretrainedXlmRobertaModel.html | 8 +-- ...ablePretrainedXlmRobertaSentenceModel.html | 8 +-- .../ReadablePretrainedXlnetModel.html | 8 +-- .../nlp/embeddings/ReadsFromBytes.html | 8 +-- .../nlp/embeddings/RoBertaEmbeddings$.html | 8 +-- .../nlp/embeddings/RoBertaEmbeddings.html | 8 +-- .../RoBertaSentenceEmbeddings$.html | 8 +-- .../embeddings/RoBertaSentenceEmbeddings.html | 8 +-- .../nlp/embeddings/SentenceEmbeddings$.html | 8 +-- .../nlp/embeddings/SentenceEmbeddings.html | 8 +-- .../embeddings/UniversalSentenceEncoder$.html | 8 +-- .../embeddings/UniversalSentenceEncoder.html | 8 +-- .../nlp/embeddings/Word2VecApproach$.html | 8 +-- .../nlp/embeddings/Word2VecApproach.html | 8 +-- .../nlp/embeddings/Word2VecModel$.html | 8 +-- .../nlp/embeddings/Word2VecModel.html | 28 ++++++-- 
.../nlp/embeddings/WordEmbeddings$.html | 8 +-- .../nlp/embeddings/WordEmbeddings.html | 8 +-- .../WordEmbeddingsBinaryIndexer$.html | 8 +-- .../nlp/embeddings/WordEmbeddingsModel$.html | 8 +-- .../nlp/embeddings/WordEmbeddingsModel.html | 8 +-- .../nlp/embeddings/WordEmbeddingsReader.html | 8 +-- .../WordEmbeddingsTextIndexer$.html | 8 +-- .../nlp/embeddings/WordEmbeddingsWriter.html | 8 +-- .../nlp/embeddings/XlmRoBertaEmbeddings$.html | 8 +-- .../nlp/embeddings/XlmRoBertaEmbeddings.html | 8 +-- .../XlmRoBertaSentenceEmbeddings$.html | 8 +-- .../XlmRoBertaSentenceEmbeddings.html | 8 +-- .../nlp/embeddings/XlnetEmbeddings$.html | 8 +-- .../nlp/embeddings/XlnetEmbeddings.html | 8 +-- .../johnsnowlabs/nlp/embeddings/index.html | 16 ++--- .../DocumentSimilarityRankerFinisher$.html | 8 +-- .../DocumentSimilarityRankerFinisher.html | 8 +-- .../com/johnsnowlabs/nlp/finisher/index.html | 8 +-- .../nlp/functions$$EachAnnotations.html | 8 +-- .../nlp/functions$$ExplodeAnnotations.html | 8 +-- .../nlp/functions$$FilterAnnotations.html | 8 +-- .../nlp/functions$$MapAnnotations.html | 8 +-- docs/api/com/johnsnowlabs/nlp/functions$.html | 8 +-- docs/api/com/johnsnowlabs/nlp/index.html | 8 +-- .../nlp/pretrained/PretrainedPipeline$.html | 8 +-- .../nlp/pretrained/PretrainedPipeline.html | 8 +-- .../pretrained/PythonResourceDownloader$.html | 8 +-- .../nlp/pretrained/RepositoryMetadata.html | 8 +-- .../nlp/pretrained/ResourceDownloader$.html | 8 +-- .../nlp/pretrained/ResourceDownloader.html | 8 +-- .../nlp/pretrained/ResourceMetadata$.html | 8 +-- .../nlp/pretrained/ResourceMetadata.html | 8 +-- .../nlp/pretrained/ResourceRequest.html | 8 +-- .../nlp/pretrained/ResourceType$.html | 8 +-- .../nlp/pretrained/S3ResourceDownloader.html | 8 +-- .../johnsnowlabs/nlp/pretrained/index.html | 8 +-- .../com/johnsnowlabs/nlp/recursive/index.html | 8 +-- .../nlp/recursive/package$$Recursive.html | 8 +-- .../recursive/package$$RecursiveModel.html | 8 +-- 
.../nlp/serialization/ArrayFeature.html | 8 +-- .../nlp/serialization/Feature.html | 8 +-- .../nlp/serialization/MapFeature.html | 8 +-- .../SerializedExternalResource.html | 8 +-- .../nlp/serialization/SetFeature.html | 8 +-- .../nlp/serialization/StructFeature.html | 8 +-- .../nlp/serialization/TransducerFeature.html | 8 +-- .../johnsnowlabs/nlp/serialization/index.html | 8 +-- .../com/johnsnowlabs/nlp/training/CoNLL.html | 8 +-- .../nlp/training/CoNLL2003NerReader.html | 8 +-- .../nlp/training/CoNLLDocument.html | 8 +-- .../CoNLLHelper$$CoNLLSentenceCols.html | 8 +-- .../training/CoNLLHelper$$CoNLLTokenCols.html | 8 +-- .../nlp/training/CoNLLHelper$.html | 8 +-- .../com/johnsnowlabs/nlp/training/CoNLLU.html | 8 +-- .../nlp/training/CoNLLUCols$.html | 8 +-- .../nlp/training/CoNLLUDocument.html | 8 +-- .../com/johnsnowlabs/nlp/training/POS.html | 8 +-- .../johnsnowlabs/nlp/training/PubTator.html | 8 +-- .../nlp/training/SpacyToAnnotation.html | 8 +-- .../com/johnsnowlabs/nlp/training/index.html | 8 +-- .../johnsnowlabs/nlp/util/FinisherUtil$.html | 8 +-- .../johnsnowlabs/nlp/util/GraphBuilder.html | 8 +-- .../nlp/util/LfuCache$CachedItem.html | 8 +-- .../nlp/util/LfuCache$DoubleLinked.html | 8 +-- .../nlp/util/LfuCache$FrequencyList.html | 8 +-- .../com/johnsnowlabs/nlp/util/LfuCache.html | 8 +-- .../nlp/util/LruMap$KeyPriority.html | 8 +-- .../nlp/util/LruMap$KeyPriorityOrdering$.html | 8 +-- .../api/com/johnsnowlabs/nlp/util/LruMap.html | 8 +-- .../nlp/util/SparkNlpConfigKeys$.html | 8 +-- docs/api/com/johnsnowlabs/nlp/util/index.html | 8 +-- .../nlp/util/io/CloudStorageType$.html | 8 +-- .../nlp/util/io/ExternalResource$.html | 8 +-- .../nlp/util/io/ExternalResource.html | 8 +-- .../nlp/util/io/MatchStrategy$.html | 8 +-- .../nlp/util/io/OutputHelper$.html | 8 +-- .../com/johnsnowlabs/nlp/util/io/ReadAs$.html | 8 +-- .../util/io/ResourceHelper$$SourceStream.html | 8 +-- .../nlp/util/io/ResourceHelper$.html | 8 +-- .../com/johnsnowlabs/nlp/util/io/index.html | 8 
+-- .../nlp/util/regex/RegexRule.html | 8 +-- .../util/regex/RuleFactory$$RuleMatch.html | 8 +-- .../nlp/util/regex/RuleFactory$.html | 8 +-- .../nlp/util/regex/RuleFactory.html | 8 +-- .../nlp/util/regex/TransformStrategy$.html | 8 +-- .../johnsnowlabs/nlp/util/regex/index.html | 8 +-- .../com/johnsnowlabs/storage/BytesKey.html | 8 +-- .../com/johnsnowlabs/storage/Database$.html | 8 +-- .../com/johnsnowlabs/storage/Database.html | 8 +-- .../johnsnowlabs/storage/HasConnection.html | 8 +-- .../com/johnsnowlabs/storage/HasStorage.html | 8 +-- .../johnsnowlabs/storage/HasStorageModel.html | 8 +-- .../storage/HasStorageOptions.html | 8 +-- .../storage/HasStorageReader.html | 8 +-- .../johnsnowlabs/storage/HasStorageRef$.html | 8 +-- .../johnsnowlabs/storage/HasStorageRef.html | 8 +-- .../storage/RocksDBConnection$.html | 8 +-- .../storage/RocksDBConnection.html | 8 +-- .../storage/StorageBatchWriter.html | 8 +-- .../johnsnowlabs/storage/StorageFormat.html | 8 +-- .../johnsnowlabs/storage/StorageHelper$.html | 8 +-- .../johnsnowlabs/storage/StorageLocator$.html | 8 +-- .../johnsnowlabs/storage/StorageLocator.html | 8 +-- .../storage/StorageReadWriter.html | 8 +-- .../johnsnowlabs/storage/StorageReadable.html | 8 +-- .../johnsnowlabs/storage/StorageReader.html | 8 +-- .../johnsnowlabs/storage/StorageWriter.html | 8 +-- docs/api/com/johnsnowlabs/storage/index.html | 8 +-- .../api/com/johnsnowlabs/util/Benchmark$.html | 8 +-- docs/api/com/johnsnowlabs/util/Build$.html | 8 +-- .../johnsnowlabs/util/CoNLLGenerator$.html | 8 +-- .../com/johnsnowlabs/util/ConfigHelper$.html | 8 +-- .../com/johnsnowlabs/util/ConfigLoader$.html | 8 +-- .../com/johnsnowlabs/util/FileHelper$.html | 8 +-- .../com/johnsnowlabs/util/JsonBuilder$.html | 8 +-- .../com/johnsnowlabs/util/JsonParser$.html | 8 +-- .../johnsnowlabs/util/PipelineModels$.html | 8 +-- .../johnsnowlabs/util/TrainingHelper$.html | 8 +-- docs/api/com/johnsnowlabs/util/Version$.html | 8 +-- 
docs/api/com/johnsnowlabs/util/Version.html | 8 +-- .../johnsnowlabs/util/ZipArchiveUtil$.html | 8 +-- docs/api/com/johnsnowlabs/util/index.html | 8 +-- .../util/spark/LongMapAccumulator.html | 8 +-- .../util/spark/MapAccumulator.html | 8 +-- .../johnsnowlabs/util/spark/SparkUtil$.html | 8 +-- .../com/johnsnowlabs/util/spark/index.html | 8 +-- docs/api/index.html | 8 +-- docs/api/index.js | 2 +- docs/api/python/.buildinfo | 2 +- docs/api/python/genindex.html | 12 +++- docs/api/python/getting_started/index.html | 20 +++--- docs/api/python/index.html | 2 +- docs/api/python/modules/index.html | 2 +- docs/api/python/modules/sparknlp.html | 6 +- .../python/modules/sparknlp/annotation.html | 2 +- .../modules/sparknlp/annotation_audio.html | 2 +- .../modules/sparknlp/annotation_image.html | 2 +- .../annotator/audio/hubert_for_ctc.html | 2 +- .../annotator/audio/wav2vec2_for_ctc.html | 2 +- .../annotator/audio/whisper_for_ctc.html | 2 +- .../sparknlp/annotator/chunk2_doc.html | 2 +- .../modules/sparknlp/annotator/chunker.html | 2 +- .../albert_for_question_answering.html | 2 +- .../albert_for_sequence_classification.html | 2 +- .../albert_for_token_classification.html | 2 +- .../bart_for_zero_shot_classification.html | 2 +- .../bert_for_question_answering.html | 2 +- .../bert_for_sequence_classification.html | 2 +- .../bert_for_token_classification.html | 2 +- .../bert_for_zero_shot_classification.html | 2 +- .../camembert_for_question_answering.html | 2 +- ...camembert_for_sequence_classification.html | 2 +- .../camembert_for_token_classification.html | 2 +- .../classifier_dl/classifier_dl.html | 2 +- .../deberta_for_question_answering.html | 2 +- .../deberta_for_sequence_classification.html | 2 +- .../deberta_for_token_classification.html | 2 +- .../distil_bert_for_question_answering.html | 2 +- ...stil_bert_for_sequence_classification.html | 2 +- .../distil_bert_for_token_classification.html | 2 +- ...til_bert_for_zero_shot_classification.html | 2 +- 
.../longformer_for_question_answering.html | 2 +- ...ongformer_for_sequence_classification.html | 2 +- .../longformer_for_token_classification.html | 2 +- .../classifier_dl/multi_classifier_dl.html | 2 +- .../roberta_for_question_answering.html | 2 +- .../roberta_for_sequence_classification.html | 2 +- .../roberta_for_token_classification.html | 2 +- .../roberta_for_zero_shot_classification.html | 2 +- .../annotator/classifier_dl/sentiment_dl.html | 2 +- .../tapas_for_question_answering.html | 2 +- .../xlm_roberta_for_question_answering.html | 2 +- ...m_roberta_for_sequence_classification.html | 2 +- .../xlm_roberta_for_token_classification.html | 2 +- ..._roberta_for_zero_shot_classification.html | 2 +- .../xlnet_for_sequence_classification.html | 2 +- .../xlnet_for_token_classification.html | 2 +- .../annotator/coref/spanbert_coref.html | 2 +- .../cv/convnext_for_image_classification.html | 2 +- .../cv/swin_for_image_classification.html | 2 +- .../cv/vit_for_image_classification.html | 2 +- .../sparknlp/annotator/date2_chunk.html | 2 +- .../dependency/dependency_parser.html | 2 +- .../dependency/typed_dependency_parser.html | 2 +- .../annotator/document_normalizer.html | 2 +- .../embeddings/albert_embeddings.html | 2 +- .../annotator/embeddings/bert_embeddings.html | 2 +- .../embeddings/bert_sentence_embeddings.html | 2 +- .../embeddings/camembert_embeddings.html | 2 +- .../embeddings/chunk_embeddings.html | 2 +- .../embeddings/deberta_embeddings.html | 2 +- .../embeddings/distil_bert_embeddings.html | 2 +- .../annotator/embeddings/doc2vec.html | 10 ++- .../annotator/embeddings/e5_embeddings.html | 2 +- .../annotator/embeddings/elmo_embeddings.html | 2 +- .../embeddings/instructor_embeddings.html | 2 +- .../embeddings/longformer_embeddings.html | 2 +- .../embeddings/mpnet_embeddings.html | 2 +- .../embeddings/roberta_embeddings.html | 2 +- .../roberta_sentence_embeddings.html | 2 +- .../embeddings/sentence_embeddings.html | 2 +- 
.../universal_sentence_encoder.html | 2 +- .../annotator/embeddings/word2vec.html | 10 ++- .../annotator/embeddings/word_embeddings.html | 2 +- .../embeddings/xlm_roberta_embeddings.html | 2 +- .../xlm_roberta_sentence_embeddings.html | 2 +- .../embeddings/xlnet_embeddings.html | 2 +- .../sparknlp/annotator/er/entity_ruler.html | 2 +- .../sparknlp/annotator/graph_extraction.html | 2 +- .../yake_keyword_extraction.html | 2 +- .../annotator/ld_dl/language_detector_dl.html | 2 +- .../sparknlp/annotator/lemmatizer.html | 2 +- .../annotator/matcher/big_text_matcher.html | 2 +- .../annotator/matcher/date_matcher.html | 2 +- .../annotator/matcher/multi_date_matcher.html | 2 +- .../annotator/matcher/regex_matcher.html | 2 +- .../annotator/matcher/text_matcher.html | 2 +- .../sparknlp/annotator/n_gram_generator.html | 2 +- .../sparknlp/annotator/ner/ner_approach.html | 2 +- .../sparknlp/annotator/ner/ner_converter.html | 2 +- .../sparknlp/annotator/ner/ner_crf.html | 2 +- .../sparknlp/annotator/ner/ner_dl.html | 2 +- .../annotator/ner/ner_overwriter.html | 2 +- .../annotator/ner/zero_shot_ner_model.html | 2 +- .../sparknlp/annotator/normalizer.html | 2 +- .../annotator/openai/openai_completion.html | 2 +- .../annotator/openai/openai_embeddings.html | 2 +- .../annotator/param/classifier_encoder.html | 2 +- .../annotator/param/evaluation_dl_params.html | 2 +- .../sparknlp/annotator/pos/perceptron.html | 2 +- .../annotator/sentence/sentence_detector.html | 2 +- .../sentence/sentence_detector_dl.html | 2 +- .../sentiment/sentiment_detector.html | 2 +- .../annotator/sentiment/vivekn_sentiment.html | 2 +- .../annotator/seq2seq/bart_transformer.html | 2 +- .../annotator/seq2seq/gpt2_transformer.html | 2 +- .../annotator/seq2seq/marian_transformer.html | 2 +- .../annotator/seq2seq/t5_transformer.html | 2 +- .../document_similarity_ranker.html | 2 +- .../spell_check/context_spell_checker.html | 2 +- .../spell_check/norvig_sweeting.html | 2 +- .../spell_check/symmetric_delete.html | 
2 +- .../modules/sparknlp/annotator/stemmer.html | 2 +- .../annotator/stop_words_cleaner.html | 2 +- .../annotator/tf_ner_dl_graph_builder.html | 2 +- .../annotator/token/chunk_tokenizer.html | 2 +- .../annotator/token/recursive_tokenizer.html | 2 +- .../annotator/token/regex_tokenizer.html | 2 +- .../sparknlp/annotator/token/tokenizer.html | 2 +- .../sparknlp/annotator/ws/word_segmenter.html | 2 +- .../sparknlp/base/audio_assembler.html | 2 +- .../modules/sparknlp/base/doc2_chunk.html | 2 +- .../sparknlp/base/document_assembler.html | 2 +- .../sparknlp/base/embeddings_finisher.html | 2 +- .../modules/sparknlp/base/finisher.html | 2 +- .../modules/sparknlp/base/graph_finisher.html | 2 +- .../sparknlp/base/has_recursive_fit.html | 2 +- .../base/has_recursive_transform.html | 2 +- .../sparknlp/base/image_assembler.html | 2 +- .../modules/sparknlp/base/light_pipeline.html | 2 +- .../base/multi_document_assembler.html | 2 +- .../sparknlp/base/recursive_pipeline.html | 2 +- .../sparknlp/base/table_assembler.html | 2 +- .../modules/sparknlp/base/token2_chunk.html | 2 +- .../sparknlp/base/token_assembler.html | 2 +- .../sparknlp/common/annotator_approach.html | 2 +- .../sparknlp/common/annotator_model.html | 2 +- .../sparknlp/common/annotator_properties.html | 2 +- .../sparknlp/common/match_strategy.html | 2 +- .../modules/sparknlp/common/properties.html | 2 +- .../modules/sparknlp/common/read_as.html | 2 +- .../common/recursive_annotator_approach.html | 2 +- .../python/modules/sparknlp/common/utils.html | 2 +- .../python/modules/sparknlp/functions.html | 2 +- .../sparknlp/internal/annotator_java_ml.html | 2 +- .../internal/annotator_transformer.html | 2 +- .../internal/extended_java_wrapper.html | 2 +- .../internal/params_getters_setters.html | 2 +- .../modules/sparknlp/internal/recursive.html | 2 +- .../modules/sparknlp/logging/comet.html | 2 +- .../pretrained/pretrained_pipeline.html | 2 +- .../pretrained/resource_downloader.html | 2 +- 
.../modules/sparknlp/training/conll.html | 2 +- .../modules/sparknlp/training/conllu.html | 2 +- .../python/modules/sparknlp/training/pos.html | 2 +- .../modules/sparknlp/training/pub_tator.html | 2 +- .../training/spacy_to_annotation.html | 2 +- docs/api/python/objects.inv | Bin 13106 -> 13120 bytes docs/api/python/py-modindex.html | 2 +- .../sparknlp/annotation/index.html | 2 +- .../sparknlp/annotation_audio/index.html | 2 +- .../sparknlp/annotation_image/index.html | 2 +- .../annotator/audio/hubert_for_ctc/index.html | 2 +- .../sparknlp/annotator/audio/index.html | 2 +- .../audio/wav2vec2_for_ctc/index.html | 2 +- .../audio/whisper_for_ctc/index.html | 2 +- .../sparknlp/annotator/chunk2_doc/index.html | 2 +- .../sparknlp/annotator/chunker/index.html | 2 +- .../albert_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../bert_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../bert_for_token_classification/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../classifier_dl/classifier_dl/index.html | 2 +- .../deberta_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../annotator/classifier_dl/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../multi_classifier_dl/index.html | 2 +- .../roberta_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../classifier_dl/sentiment_dl/index.html | 2 +- .../tapas_for_question_answering/index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../index.html | 2 +- .../xlnet_for_token_classification/index.html | 2 +- .../sparknlp/annotator/coref/index.html | 2 +- .../annotator/coref/spanbert_coref/index.html | 2 +- .../index.html | 2 +- 
.../sparknlp/annotator/cv/index.html | 2 +- .../swin_for_image_classification/index.html | 2 +- .../vit_for_image_classification/index.html | 2 +- .../sparknlp/annotator/date2_chunk/index.html | 2 +- .../dependency/dependency_parser/index.html | 2 +- .../sparknlp/annotator/dependency/index.html | 2 +- .../typed_dependency_parser/index.html | 2 +- .../annotator/document_normalizer/index.html | 2 +- .../embeddings/albert_embeddings/index.html | 2 +- .../embeddings/bert_embeddings/index.html | 2 +- .../bert_sentence_embeddings/index.html | 2 +- .../camembert_embeddings/index.html | 2 +- .../embeddings/chunk_embeddings/index.html | 2 +- .../embeddings/deberta_embeddings/index.html | 2 +- .../distil_bert_embeddings/index.html | 2 +- .../annotator/embeddings/doc2vec/index.html | 10 ++- .../embeddings/e5_embeddings/index.html | 2 +- .../embeddings/elmo_embeddings/index.html | 2 +- .../sparknlp/annotator/embeddings/index.html | 2 +- .../instructor_embeddings/index.html | 2 +- .../longformer_embeddings/index.html | 2 +- .../embeddings/mpnet_embeddings/index.html | 2 +- .../embeddings/roberta_embeddings/index.html | 2 +- .../roberta_sentence_embeddings/index.html | 2 +- .../embeddings/sentence_embeddings/index.html | 2 +- .../universal_sentence_encoder/index.html | 2 +- .../annotator/embeddings/word2vec/index.html | 10 ++- .../embeddings/word_embeddings/index.html | 2 +- .../xlm_roberta_embeddings/index.html | 2 +- .../index.html | 2 +- .../embeddings/xlnet_embeddings/index.html | 2 +- .../annotator/er/entity_ruler/index.html | 2 +- .../sparknlp/annotator/er/index.html | 2 +- .../annotator/graph_extraction/index.html | 2 +- .../autosummary/sparknlp/annotator/index.html | 2 +- .../annotator/keyword_extraction/index.html | 2 +- .../yake_keyword_extraction/index.html | 2 +- .../sparknlp/annotator/ld_dl/index.html | 2 +- .../ld_dl/language_detector_dl/index.html | 2 +- .../sparknlp/annotator/lemmatizer/index.html | 2 +- .../matcher/big_text_matcher/index.html | 2 +- 
.../annotator/matcher/date_matcher/index.html | 2 +- .../sparknlp/annotator/matcher/index.html | 2 +- .../matcher/multi_date_matcher/index.html | 2 +- .../matcher/regex_matcher/index.html | 2 +- .../annotator/matcher/text_matcher/index.html | 2 +- .../annotator/n_gram_generator/index.html | 2 +- .../sparknlp/annotator/ner/index.html | 2 +- .../annotator/ner/ner_approach/index.html | 2 +- .../annotator/ner/ner_converter/index.html | 2 +- .../sparknlp/annotator/ner/ner_crf/index.html | 2 +- .../sparknlp/annotator/ner/ner_dl/index.html | 2 +- .../annotator/ner/ner_overwriter/index.html | 2 +- .../ner/zero_shot_ner_model/index.html | 2 +- .../sparknlp/annotator/normalizer/index.html | 2 +- .../sparknlp/annotator/openai/index.html | 2 +- .../openai/openai_completion/index.html | 2 +- .../openai/openai_embeddings/index.html | 2 +- .../param/classifier_encoder/index.html | 2 +- .../param/evaluation_dl_params/index.html | 2 +- .../sparknlp/annotator/param/index.html | 2 +- .../sparknlp/annotator/pos/index.html | 2 +- .../annotator/pos/perceptron/index.html | 2 +- .../sparknlp/annotator/sentence/index.html | 2 +- .../sentence/sentence_detector/index.html | 2 +- .../sentence/sentence_detector_dl/index.html | 2 +- .../sparknlp/annotator/sentiment/index.html | 2 +- .../sentiment/sentiment_detector/index.html | 2 +- .../sentiment/vivekn_sentiment/index.html | 2 +- .../seq2seq/bart_transformer/index.html | 2 +- .../seq2seq/gpt2_transformer/index.html | 2 +- .../sparknlp/annotator/seq2seq/index.html | 2 +- .../seq2seq/marian_transformer/index.html | 2 +- .../seq2seq/t5_transformer/index.html | 2 +- .../document_similarity_ranker/index.html | 2 +- .../sparknlp/annotator/similarity/index.html | 2 +- .../context_spell_checker/index.html | 2 +- .../sparknlp/annotator/spell_check/index.html | 2 +- .../spell_check/norvig_sweeting/index.html | 2 +- .../spell_check/symmetric_delete/index.html | 2 +- .../sparknlp/annotator/stemmer/index.html | 2 +- 
.../annotator/stop_words_cleaner/index.html | 2 +- .../tf_ner_dl_graph_builder/index.html | 2 +- .../token/chunk_tokenizer/index.html | 2 +- .../sparknlp/annotator/token/index.html | 2 +- .../token/recursive_tokenizer/index.html | 2 +- .../token/regex_tokenizer/index.html | 2 +- .../annotator/token/tokenizer/index.html | 2 +- .../sparknlp/annotator/ws/index.html | 2 +- .../annotator/ws/word_segmenter/index.html | 2 +- .../sparknlp/base/audio_assembler/index.html | 2 +- .../sparknlp/base/doc2_chunk/index.html | 2 +- .../base/document_assembler/index.html | 2 +- .../base/embeddings_finisher/index.html | 2 +- .../sparknlp/base/finisher/index.html | 2 +- .../sparknlp/base/graph_finisher/index.html | 2 +- .../base/has_recursive_fit/index.html | 2 +- .../base/has_recursive_transform/index.html | 2 +- .../sparknlp/base/image_assembler/index.html | 2 +- .../autosummary/sparknlp/base/index.html | 2 +- .../sparknlp/base/light_pipeline/index.html | 2 +- .../base/multi_document_assembler/index.html | 2 +- .../base/recursive_pipeline/index.html | 2 +- .../sparknlp/base/table_assembler/index.html | 2 +- .../sparknlp/base/token2_chunk/index.html | 2 +- .../sparknlp/base/token_assembler/index.html | 2 +- .../common/annotator_approach/index.html | 2 +- .../common/annotator_model/index.html | 2 +- .../common/annotator_properties/index.html | 2 +- .../sparknlp/common/annotator_type/index.html | 2 +- .../common/coverage_result/index.html | 2 +- .../autosummary/sparknlp/common/index.html | 2 +- .../sparknlp/common/match_strategy/index.html | 2 +- .../sparknlp/common/properties/index.html | 2 +- .../sparknlp/common/read_as/index.html | 2 +- .../recursive_annotator_approach/index.html | 2 +- .../sparknlp/common/storage/index.html | 2 +- .../sparknlp/common/utils/index.html | 2 +- .../autosummary/sparknlp/functions/index.html | 2 +- .../reference/autosummary/sparknlp/index.html | 2 +- .../internal/annotator_java_ml/index.html | 2 +- .../internal/annotator_transformer/index.html | 2 +- 
.../internal/extended_java_wrapper/index.html | 2 +- .../autosummary/sparknlp/internal/index.html | 2 +- .../params_getters_setters/index.html | 2 +- .../sparknlp/internal/recursive/index.html | 2 +- .../sparknlp/logging/comet/index.html | 2 +- .../autosummary/sparknlp/logging/index.html | 2 +- .../sparknlp/pretrained/index.html | 2 +- .../pretrained/pretrained_pipeline/index.html | 2 +- .../pretrained/resource_downloader/index.html | 2 +- .../sparknlp/pretrained/utils/index.html | 2 +- .../sparknlp/training/conll/index.html | 2 +- .../sparknlp/training/conllu/index.html | 2 +- .../autosummary/sparknlp/training/index.html | 2 +- .../sparknlp/training/pos/index.html | 2 +- .../sparknlp/training/pub_tator/index.html | 2 +- .../training/spacy_to_annotation/index.html | 2 +- .../sparknlp/training/tfgraphs/index.html | 2 +- .../sparknlp/upload_to_hub/index.html | 2 +- .../autosummary/sparknlp/util/index.html | 2 +- docs/api/python/reference/index.html | 2 +- docs/api/python/search.html | 2 +- docs/api/python/searchindex.js | 2 +- .../python/static/documentation_options.js | 2 +- docs/api/python/third_party/Comet.html | 2 +- docs/api/python/third_party/MLflow.html | 2 +- docs/api/python/third_party/index.html | 2 +- docs/api/python/user_guide/annotation.html | 2 +- docs/api/python/user_guide/annotators.html | 2 +- .../python/user_guide/custom_pipelines.html | 2 +- docs/api/python/user_guide/helpers.html | 2 +- docs/api/python/user_guide/index.html | 2 +- .../python/user_guide/light_pipelines.html | 2 +- .../user_guide/pretrained_pipelines.html | 2 +- docs/api/python/user_guide/training.html | 2 +- docs/api/scala/collection/compat/index.html | 8 +-- docs/api/scala/collection/index.html | 8 +-- docs/api/scala/index.html | 8 +-- .../nlp/embeddings/Doc2VecTestSpec.scala | 6 +- 1419 files changed, 5261 insertions(+), 4657 deletions(-) diff --git a/docs/api/com/index.html b/docs/api/com/index.html index b75aa4ef4a5a0a..c5a299921a234d 100644 --- a/docs/api/com/index.html +++ 
b/docs/api/com/index.html @@ -3,9 +3,9 @@ - Spark NLP 5.1.0 ScalaDoc - com - - + Spark NLP 5.1.1 ScalaDoc - com + + @@ -28,7 +28,7 @@