diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh.md new file mode 100644 index 00000000000000..169e9a31db0eb5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Chinese Deberta Embeddings Cased model (from IDEA-CCNL) +author: John Snow Labs +name: deberta_embeddings_erlangshen_v2_chinese_sentencepiece +date: 2023-06-26 +tags: [open_source, deberta, deberta_embeddings, debertav2formaskedlm, zh, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaV2ForMaskedLM model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `Erlangshen-DeBERTa-v2-186M-Chinese-SentencePiece` is a Chinese model originally trained by `IDEA-CCNL`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh_5.0.0_3.0_1687781761029.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_erlangshen_v2_chinese_sentencepiece_zh_5.0.0_3.0_1687781761029.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_erlangshen_v2_chinese_sentencepiece","zh") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_erlangshen_v2_chinese_sentencepiece","zh") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_erlangshen_v2_chinese_sentencepiece","zh") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_erlangshen_v2_chinese_sentencepiece","zh") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_erlangshen_v2_chinese_sentencepiece| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|zh| +|Size:|443.8 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_mlm_test_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_mlm_test_en.md new file mode 100644 index 00000000000000..2ee4b096c3fc6c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_mlm_test_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Deberta Embeddings model (from domenicrosati) +author: John Snow Labs +name: deberta_embeddings_mlm_test +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `deberta-mlm-test` is an English model originally trained by `domenicrosati`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_mlm_test_en_5.0.0_3.0_1687782209221.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_mlm_test_en_5.0.0_3.0_1687782209221.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_mlm_test","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_mlm_test","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_mlm_test","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_mlm_test","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_mlm_test| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|265.4 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_spm_vie_vie.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_spm_vie_vie.md new file mode 100644 index 00000000000000..8f303ff4ccea17 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_spm_vie_vie.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Vietnamese Deberta Embeddings model (from hieule) +author: John Snow Labs +name: deberta_embeddings_spm_vie +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, vie, onnx] +task: Embeddings +language: vie +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `spm-vie-deberta` is a Vietnamese model originally trained by `hieule`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_spm_vie_vie_5.0.0_3.0_1687780843112.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_spm_vie_vie_5.0.0_3.0_1687780843112.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie","vie") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie","vie") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie","vie") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_spm_vie","vie") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_spm_vie| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|vie| +|Size:|289.7 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_tapt_nbme_v3_base_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_tapt_nbme_v3_base_en.md new file mode 100644 index 00000000000000..30f49fc915d1a6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_tapt_nbme_v3_base_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Deberta Embeddings model (from ZZ99) +author: John Snow Labs +name: deberta_embeddings_tapt_nbme_v3_base +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `tapt_nbme_deberta_v3_base` is an English model originally trained by `ZZ99`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_tapt_nbme_v3_base_en_5.0.0_3.0_1687780869777.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_tapt_nbme_v3_base_en_5.0.0_3.0_1687780869777.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_tapt_nbme_v3_base","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_tapt_nbme_v3_base","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_tapt_nbme_v3_base","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_tapt_nbme_v3_base","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_tapt_nbme_v3_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|687.5 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_vie_small_vie.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_vie_small_vie.md new file mode 100644 index 00000000000000..6b33b485b4900c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_vie_small_vie.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Vietnamese Deberta Embeddings model (from binhquoc) +author: John Snow Labs +name: deberta_embeddings_vie_small +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, vie, onnx] +task: Embeddings +language: vie +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `vie-deberta-small` is a Vietnamese model originally trained by `binhquoc`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_vie_small_vie_5.0.0_3.0_1687780922709.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_vie_small_vie_5.0.0_3.0_1687780922709.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_vie_small","vie") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_vie_small","vie") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_vie_small","vie") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_vie_small","vie") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_vie_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|vie| +|Size:|277.4 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en.md new file mode 100644 index 00000000000000..50107e47b2c704 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Deberta Embeddings model (from domenicrosati) +author: John Snow Labs +name: deberta_embeddings_xsmall_dapt_scientific_papers_pubmed +date: 2023-06-26 +tags: [deberta, open_source, deberta_embeddings, debertav2formaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DebertaEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `deberta-xsmall-dapt-scientific-papers-pubmed` is an English model originally trained by `domenicrosati`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en_5.0.0_3.0_1687780385270.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_embeddings_xsmall_dapt_scientific_papers_pubmed_en_5.0.0_3.0_1687780385270.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_xsmall_dapt_scientific_papers_pubmed","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_xsmall_dapt_scientific_papers_pubmed","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_xsmall_dapt_scientific_papers_pubmed","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = DeBertaEmbeddings.pretrained("deberta_embeddings_xsmall_dapt_scientific_papers_pubmed","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_embeddings_xsmall_dapt_scientific_papers_pubmed| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|244.9 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_small_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_small_en.md new file mode 100644 index 00000000000000..4f13393720b21f --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_small_en.md @@ -0,0 +1,100 @@ +--- +layout: model +title: DeBERTa small model +author: John Snow Labs +name: deberta_v3_small +date: 2023-06-26 +tags: [en, english, embeddings, deberta, v3, small, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, and Weizhu Chen. It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%).
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_small_en_5.0.0_3.0_1687783064877.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_small_en_5.0.0_3.0_1687783064877.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_small", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_small", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_small").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_small", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_small", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_small").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_v3_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|334.1 MB| +|Case sensitive:|true| +|Max sentence length:|128| + +## Benchmarking + +```bash +Benchmarking +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_xsmall_en.md b/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_xsmall_en.md new file mode 100644 index 00000000000000..9f3baaee954b9e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-deberta_v3_xsmall_en.md @@ -0,0 +1,102 @@ +--- +layout: model +title: DeBERTa xsmall model +author: John Snow Labs +name: deberta_v3_xsmall +date: 2023-06-26 +tags: [en, english, embeddings, deberta, xsmall, v3, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DeBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, and Weizhu Chen. It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%).
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_xsmall_en_5.0.0_3.0_1687782011152.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_xsmall_en_5.0.0_3.0_1687782011152.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_xsmall", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") + +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_xsmall", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") + +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_xsmall").predict("""Put your text here.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_xsmall", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_xsmall", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_xsmall").predict("""Put your text here.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_v3_xsmall| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|167.8 MB| +|Case sensitive:|true| +|Max sentence length:|128| + +## Benchmarking + +```bash +Benchmarking +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_BERTino_it.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_BERTino_it.md new file mode 100644 index 00000000000000..7233e7dd8045ec --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_BERTino_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian DistilBERT Embeddings +author: John Snow Labs +name: distilbert_embeddings_BERTino +date: 2023-06-26 +tags: [distilbert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `BERTino` is an Italian model originally trained by `indigo-ai`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_BERTino_it_5.0.0_3.0_1687777390566.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_BERTino_it_5.0.0_3.0_1687777390566.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_BERTino","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_BERTino","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.BERTino").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_BERTino","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_BERTino","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.BERTino").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_BERTino| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|253.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_indonesian_id.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_indonesian_id.md new file mode 100644 index 00000000000000..4e1fad3954642c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_indonesian_id.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Indonesian DistilBERT Embeddings +author: John Snow Labs +name: distilbert_embeddings_distilbert_base_indonesian +date: 2023-06-26 +tags: [distilbert, embeddings, id, open_source, onnx] +task: Embeddings +language: id +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-indonesian` is an Indonesian model originally trained by `cahya`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_indonesian_id_5.0.0_3.0_1687777360898.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_indonesian_id_5.0.0_3.0_1687777360898.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_indonesian","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_indonesian","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.distilbert").predict("""Saya suka percikan NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_indonesian","id") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka percikan NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_indonesian","id") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka percikan NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("id.embed.distilbert").predict("""Saya suka percikan NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_base_indonesian| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|id| +|Size:|253.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en.md new file mode 100644 index 00000000000000..9202845526f1ca --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English DistilBERT Embeddings (%85 sparse) +author: John Snow Labs +name: distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa +date: 2023-06-26 +tags: [distilbert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-uncased-sparse-85-unstructured-pruneofa` is an English model originally trained by `Intel`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en_5.0.0_3.0_1687777999251.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa_en_5.0.0_3.0_1687777999251.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert_base_uncased_sparse_85_unstructured_pruneofa").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert_base_uncased_sparse_85_unstructured_pruneofa").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_base_uncased_sparse_85_unstructured_pruneofa| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|132.5 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en.md new file mode 100644 index 00000000000000..d172172048e1b6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English DistilBERT Embeddings (%90 sparse) +author: John Snow Labs +name: distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa +date: 2023-06-26 +tags: [distilbert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-uncased-sparse-90-unstructured-pruneofa` is an English model originally trained by `Intel`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en_5.0.0_3.0_1687778292303.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa_en_5.0.0_3.0_1687778292303.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert_base_uncased_sparse_90_unstructured_pruneofa").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert_base_uncased_sparse_90_unstructured_pruneofa").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_base_uncased_sparse_90_unstructured_pruneofa| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|123.3 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_fa_zwnj_base_fa.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_fa_zwnj_base_fa.md new file mode 100644 index 00000000000000..6e1c4d47a4a93b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_distilbert_fa_zwnj_base_fa.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Persian DistilBERT Embeddings (from HooshvareLab) +author: John Snow Labs +name: distilbert_embeddings_distilbert_fa_zwnj_base +date: 2023-06-26 +tags: [distilbert, embeddings, fa, open_source, onnx] +task: Embeddings +language: fa +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-fa-zwnj-base` is a Persian model originally trained by `HooshvareLab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_fa_zwnj_base_fa_5.0.0_3.0_1687778060683.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_fa_zwnj_base_fa_5.0.0_3.0_1687778060683.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_fa_zwnj_base","fa") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["من عاشق جرقه NLP هستم"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_fa_zwnj_base","fa") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("من عاشق جرقه NLP هستم").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("fa.embed.distilbert_fa_zwnj_base").predict("""من عاشق جرقه NLP هستم""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_fa_zwnj_base","fa") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["من عاشق جرقه NLP هستم"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_fa_zwnj_base","fa") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("من عاشق جرقه NLP هستم").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("fa.embed.distilbert_fa_zwnj_base").predict("""من عاشق جرقه NLP هستم""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_distilbert_fa_zwnj_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|fa| +|Size:|282.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_finetuned_sarcasm_classification_en.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_finetuned_sarcasm_classification_en.md new file mode 100644 index 00000000000000..7df586e990947a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_finetuned_sarcasm_classification_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English DistilBERT Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: distilbert_embeddings_finetuned_sarcasm_classification +date: 2023-06-26 +tags: [open_source, distilbert, embeddings, sarcasm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `distilbert_embeddings_finetuned_sarcasm_classification` is an English model originally trained by `mrm8488`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_finetuned_sarcasm_classification_en_5.0.0_3.0_1687777366459.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_finetuned_sarcasm_classification_en_5.0.0_3.0_1687777366459.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_finetuned_sarcasm_classification","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_finetuned_sarcasm_classification","en") + .setInputCols(Array("document", "token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distil_bert.finetuned").predict("""PUT YOUR STRING HERE.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_finetuned_sarcasm_classification","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_finetuned_sarcasm_classification","en") + .setInputCols(Array("document", "token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distil_bert.finetuned").predict("""PUT YOUR STRING HERE.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_finetuned_sarcasm_classification| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|247.2 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_bn_distilbert_bn.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_bn_distilbert_bn.md new file mode 100644 index 00000000000000..750a960889bb4e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_bn_distilbert_bn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Bangla DistilBERT Embeddings (from neuralspace-reverie) +author: John Snow Labs +name: distilbert_embeddings_indic_transformers_bn_distilbert +date: 2023-06-26 +tags: [distilbert, embeddings, bn, open_source, onnx] +task: Embeddings +language: bn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-bn-distilbert` is a Bangla model originally trained by `neuralspace-reverie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_indic_transformers_bn_distilbert_bn_5.0.0_3.0_1687778001310.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_indic_transformers_bn_distilbert_bn_5.0.0_3.0_1687778001310.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_bn_distilbert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_bn_distilbert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.indic_transformers_bn_distilbert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_bn_distilbert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_bn_distilbert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.indic_transformers_bn_distilbert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_indic_transformers_bn_distilbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|bn| +|Size:|248.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_hi_distilbert_hi.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_hi_distilbert_hi.md new file mode 100644 index 00000000000000..709aae71d5d5cb --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_indic_transformers_hi_distilbert_hi.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Hindi DistilBERT Embeddings (from neuralspace-reverie) +author: John Snow Labs +name: distilbert_embeddings_indic_transformers_hi_distilbert +date: 2023-06-26 +tags: [distilbert, embeddings, hi, open_source, onnx] +task: Embeddings +language: hi +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-hi-distilbert` is a Hindi model originally trained by `neuralspace-reverie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_indic_transformers_hi_distilbert_hi_5.0.0_3.0_1687778274733.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_indic_transformers_hi_distilbert_hi_5.0.0_3.0_1687778274733.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_hi_distilbert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_hi_distilbert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_distilbert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_hi_distilbert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_indic_transformers_hi_distilbert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_distilbert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_indic_transformers_hi_distilbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|hi| +|Size:|247.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_imdb_jv.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_imdb_jv.md new file mode 100644 index 00000000000000..9ddef8bf45a2fc --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_imdb_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese DistilBERT Embeddings (Small, Imdb) +author: John Snow Labs +name: distilbert_embeddings_javanese_distilbert_small_imdb +date: 2023-06-26 +tags: [distilbert, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-distilbert-small-imdb` is a Javanese model originally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_javanese_distilbert_small_imdb_jv_5.0.0_3.0_1687778138415.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_javanese_distilbert_small_imdb_jv_5.0.0_3.0_1687778138415.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_distilbert_small_imdb").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_distilbert_small_imdb").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_javanese_distilbert_small_imdb| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|247.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_jv.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_jv.md new file mode 100644 index 00000000000000..15b0dcb0b86cda --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_javanese_distilbert_small_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese DistilBERT Embeddings (Small, Wikipedia) +author: John Snow Labs +name: distilbert_embeddings_javanese_distilbert_small +date: 2023-06-26 +tags: [distilbert, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-distilbert-small` is a Javanese model originally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_javanese_distilbert_small_jv_5.0.0_3.0_1687778132742.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_javanese_distilbert_small_jv_5.0.0_3.0_1687778132742.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.distilbert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_javanese_distilbert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.distilbert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_javanese_distilbert_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|247.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_malaysian_distilbert_small_ms.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_malaysian_distilbert_small_ms.md new file mode 100644 index 00000000000000..9092f73999f3c2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_malaysian_distilbert_small_ms.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Malay DistilBERT Embeddings (from w11wo) +author: John Snow Labs +name: distilbert_embeddings_malaysian_distilbert_small +date: 2023-06-26 +tags: [distilbert, embeddings, ms, open_source, onnx] +task: Embeddings +language: ms +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `malaysian-distilbert-small` is a Malay model originally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_malaysian_distilbert_small_ms_5.0.0_3.0_1687777995710.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_malaysian_distilbert_small_ms_5.0.0_3.0_1687777995710.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_malaysian_distilbert_small","ms") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_malaysian_distilbert_small","ms") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ms.embed.distilbert").predict("""Saya suka Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_malaysian_distilbert_small","ms") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Saya suka Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_malaysian_distilbert_small","ms") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Saya suka Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ms.embed.distilbert").predict("""Saya suka Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_malaysian_distilbert_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ms| +|Size:|248.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_marathi_distilbert_mr.md b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_marathi_distilbert_mr.md new file mode 100644 index 00000000000000..9ca015f1047ab5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-distilbert_embeddings_marathi_distilbert_mr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Marathi DistilBERT Embeddings (from DarshanDeshpande) +author: John Snow Labs +name: distilbert_embeddings_marathi_distilbert +date: 2023-06-26 +tags: [distilbert, embeddings, mr, open_source, onnx] +task: Embeddings +language: mr +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `marathi-distilbert` is a Marathi model originally trained by `DarshanDeshpande`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_marathi_distilbert_mr_5.0.0_3.0_1687778299122.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_marathi_distilbert_mr_5.0.0_3.0_1687778299122.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_marathi_distilbert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_marathi_distilbert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.distilbert").predict("""मला स्पार्क एनएलपी आवडते""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_marathi_distilbert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_marathi_distilbert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.distilbert").predict("""मला स्पार्क एनएलपी आवडते""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_embeddings_marathi_distilbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|mr| +|Size:|247.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-26-roberta_base_swiss_legal_gsw.md b/docs/_posts/ahmedlone127/2023-06-26-roberta_base_swiss_legal_gsw.md new file mode 100644 index 00000000000000..8cc4701ee197fc --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-26-roberta_base_swiss_legal_gsw.md @@ -0,0 +1,80 @@ +--- +layout: model +title: Swiss Legal Roberta Embeddings +author: John Snow Labs +name: roberta_base_swiss_legal +date: 2023-06-26 +tags: [gsw, swiss, embeddings, transformer, open_source, legal, onnx] +task: Embeddings +language: gsw +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Legal Roberta Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legal-swiss-roberta-base` is a Swiss model originally trained by `joelito`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_base_swiss_legal_gsw_5.0.0_3.0_1687788882271.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_base_swiss_legal_gsw_5.0.0_3.0_1687788882271.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +sentence_embeddings = RoBertaEmbeddings.pretrained("roberta_base_swiss_legal", "gsw")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") +``` +```scala +val sentence_embeddings = RoBertaEmbeddings.pretrained("roberta_base_swiss_legal", "gsw") + .setInputCols("sentence", "token") + .setOutputCol("embeddings") +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +sentence_embeddings = RoBertaEmbeddings.pretrained("roberta_base_swiss_legal", "gsw")\ + .setInputCols(["sentence", "token"])\ + .setOutputCol("embeddings") +``` +```scala +val sentence_embeddings = RoBertaEmbeddings.pretrained("roberta_base_swiss_legal", "gsw") + .setInputCols("sentence", "token") + .setOutputCol("embeddings") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_base_swiss_legal| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|gsw| +|Size:|692.1 MB| +|Case sensitive:|true| \ No newline at end of file