From f7e848ef3dc08457ef601a91917d2a99c269f3f8 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 14:54:09 +0700 Subject: [PATCH 001/149] Add model 2023-06-21-bert_embeddings_distil_clinical_en --- ...6-21-bert_embeddings_distil_clinical_en.md | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md new file mode 100644 index 00000000000000..33f643e2799896 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from nlpie) +author: John Snow Labs +name: bert_embeddings_distil_clinical +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `distil-clinicalbert` is a English model originally trained by `nlpie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.4_1687334036385.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.4_1687334036385.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_distil_clinical| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|244.5 MB| +|Case sensitive:|true| \ No newline at end of file From c5cfc17b666fd1549279f95f36c56561515672a3 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 14:56:08 +0700 Subject: [PATCH 002/149] Add model 2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en --- ...mbeddings_carlbert_webex_mlm_spatial_en.md | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md new file mode 100644 index 00000000000000..e76ff9449ab0d3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from aditeyabaral) +author: John Snow Labs +name: bert_embeddings_carlbert_webex_mlm_spatial +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `carlbert-webex-mlm-spatial` is a English model originally trained by `aditeyabaral`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.4_1687334153231.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.4_1687334153231.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_carlbert_webex_mlm_spatial| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.6 MB| +|Case sensitive:|true| \ No newline at end of file From 2231806ba270edd7b17406f687e5c7183e0ebc68 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:21:14 +0700 Subject: [PATCH 003/149] Add model 2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en --- ...s_chemical_uncased_finetuned_cust_c2_en.md | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md new file mode 100644 index 00000000000000..1243bba960dfaf --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from Shafin) +author: John Snow Labs +name: bert_embeddings_chemical_uncased_finetuned_cust_c2 +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `chemical-bert-uncased-finetuned-cust-c2` is a English model originally trained by `shafin`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.4_1687335658105.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.4_1687335658105.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_chemical_uncased_finetuned_cust_c2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.1 MB| +|Case sensitive:|true| \ No newline at end of file From 2e406fe38eb080d6670c4da1df4ddb19dccb1718 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:22:44 +0700 Subject: [PATCH 004/149] Add model 2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it --- ...bert_embeddings_lsg16k_Italian_Legal_it.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md new file mode 100644 index 00000000000000..dc8d5a0bba5e06 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Italian Legal BERT Embeddings +author: John Snow Labs +name: bert_embeddings_lsg16k_Italian_Legal +date: 2023-06-21 +tags: [longformer, it, italian, embeddings, transformer, open_source, tensorflow, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `lsg16k-Italian-Legal-BERT` is an Italian model originally trained by `dlicari`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.4_1687335744395.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.4_1687335744395.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_lsg16k_Italian_Legal| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|454.6 MB| +|Case sensitive:|true| \ No newline at end of file From a2183c990175958dddca62a6932eb9c41f37c163 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:24:12 +0700 Subject: [PATCH 005/149] Add model 2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en --- ...mical_uncased_finetuned_cust_c1_cust_en.md | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md new file mode 100644 index 00000000000000..e4feb2c5bd8623 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from Shafin) +author: John Snow Labs +name: bert_embeddings_chemical_uncased_finetuned_cust_c1_cust +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `chemical-bert-uncased-finetuned-cust-c1-cust` is a English model originally trained by `Shafin`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.4_1687335830911.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.4_1687335830911.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_chemical_uncased_finetuned_cust_c1_cust| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.1 MB| +|Case sensitive:|true| \ No newline at end of file From ee2589944de2918b74bc0b777624e0edd45f8347 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:25:35 +0700 Subject: [PATCH 006/149] Add model 2023-06-21-bert_embeddings_legalbert_adept_en --- ...6-21-bert_embeddings_legalbert_adept_en.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md new file mode 100644 index 00000000000000..c6fb37952b5c59 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md @@ -0,0 +1,135 @@ +--- +layout: model +title: English Legal BERT Embeddings +author: John Snow Labs +name: bert_embeddings_legalbert_adept +date: 2023-06-21 +tags: [bert, en, english, embeddings, transformer, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legalbert-adept` is a English model originally trained by `hatemestinbejaia`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.4_1687335917569.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.4_1687335917569.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_legalbert_adept| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file From 8336ae4de6ad2b36f7c8efa2f03bfb78084d58e6 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:30:04 +0700 Subject: [PATCH 007/149] Add model 2023-06-21-bert_embeddings_base_uncased_issues_128_en --- ...t_embeddings_base_uncased_issues_128_en.md | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md new file mode 100644 index 00000000000000..7a81f1682737e4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md @@ -0,0 +1,141 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from antoinev17) +author: John Snow Labs +name: bert_embeddings_base_uncased_issues_128 +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert-base-uncased-issues-128` is a English model originally trained by `antoinev17 +`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.4_1687336183958.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.4_1687336183958.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_base_uncased_issues_128| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file From 9c5961032dd09e4ce622be1cc452b66272861918 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:31:10 +0700 Subject: [PATCH 008/149] Add model 2023-06-21-bert_embeddings_pretrain_ko --- .../2023-06-21-bert_embeddings_pretrain_ko.md | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md new file mode 100644 index 00000000000000..530f8a4947793c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Korean Bert Embeddings Cased model (from onlydj96) +author: John Snow Labs +name: bert_embeddings_pretrain +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, ko, tensorflow, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert_pretrain` is a Korean model originally trained by `onlydj96`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.4_1687336252702.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.4_1687336252702.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_pretrain| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ko| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file From 159c9069c2551e1b9ede6108f631a94874efae3d Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:32:10 +0700 Subject: [PATCH 009/149] Add model 2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en --- ...embeddings_olm_base_uncased_oct_2022_en.md | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md new file mode 100644 index 00000000000000..371264a43afc17 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from Tristan) +author: John Snow Labs +name: bert_embeddings_olm_base_uncased_oct_2022 +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `olm-bert-base-uncased-oct-2022` is an English model originally trained by `Tristan`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.4_1687336305222.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.4_1687336305222.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_olm_base_uncased_oct_2022| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|464.6 MB| +|Case sensitive:|true| \ No newline at end of file From 5bf420d0a60c73b2e1aa9d7470430d63828b412c Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:34:55 +0700 Subject: [PATCH 010/149] Add model 2023-06-21-legalectra_small_es --- .../2023-06-21-legalectra_small_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md b/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md new file mode 100644 index 00000000000000..8f217918038423 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Electra Legal Word Embeddings Small model +author: John Snow Labs +name: legalectra_small +date: 2023-06-21 +tags: [open_source, legalectra, embeddings, electra, legal, small, es, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Spanish Legal Word Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legalectra-small-spanish` is a Spanish model originally trained by `mrm8488`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.4_1687336489949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.4_1687336489949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +electra = BertEmbeddings.pretrained("legalectra_small","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra]) + +data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val electra = BertEmbeddings.pretrained("legalectra_small","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra)) + +val data = Seq("Amo a Spark NLP.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert.small_legal").predict("""Amo a Spark NLP.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +electra = BertEmbeddings.pretrained("legalectra_small","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra]) + +data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val electra = BertEmbeddings.pretrained("legalectra_small","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra)) + +val data = Seq("Amo a Spark NLP.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert.small_legal").predict("""Amo a Spark NLP.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|legalectra_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|51.2 MB| +|Case sensitive:|true| \ No newline at end of file From 55f9f3f5eb245f038f0336a7e2e60b6973dbf3fa Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:35:55 +0700 Subject: [PATCH 011/149] Add model 2023-06-21-biobert_pubmed_base_cased_v1.2_en --- ...06-21-biobert_pubmed_base_cased_v1.2_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md new file mode 100644 index 00000000000000..c7d99cb4660296 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: BioBERT Embeddings (Pubmed) +author: John Snow Labs +name: biobert_pubmed_base_cased_v1.2 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is the v1.2 of [biobert_pubmed_base_cased](https://nlp.johnsnowlabs.com/2020/09/19/biobert_pubmed_base_cased.html) model and contains pre-trained weights of BioBERT, a language representation model for biomedical domain, especially designed for biomedical text mining tasks such as biomedical named entity recognition, relation extraction, question answering, etc. 
The details are described in the paper "[BioBERT: a pre-trained biomedical language representation model for biomedical text mining](https://arxiv.org/abs/1901.08746v2)". + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.4_1687336480762.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.4_1687336480762.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I hate cancer"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I hate cancer").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.biobert.pubmed.cased_base").predict("""I hate cancer""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I hate cancer"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I hate cancer").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.biobert.pubmed.cased_base").predict("""I hate cancer""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|biobert_pubmed_base_cased_v1.2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.6 MB| +|Case sensitive:|true| \ No newline at end of file From 22d5aa64de496a0994f583f9bb7d01b22354dac0 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:36:55 +0700 Subject: [PATCH 012/149] Add model 2023-06-21-bert_embeddings_jobbert_base_cased_en --- ...1-bert_embeddings_jobbert_base_cased_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md new file mode 100644 index 00000000000000..db07e2246747a5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English BERT Embeddings (from jjzha) +author: John Snow Labs +name: bert_embeddings_jobbert_base_cased +date: 2023-06-21 +tags: [bert, en, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `jobbert-base-cased` is an English model originally trained by `jjzha`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.4_1687336524220.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.4_1687336524220.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.cased_base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.cased_base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_jobbert_base_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|402.2 MB| +|Case sensitive:|true| \ No newline at end of file From cb177d65b8f8bbef6920294e0a6e50d8c5ee14b0 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:37:55 +0700 Subject: [PATCH 013/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de --- ..._base_gc4_64k_700000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md new file mode 100644 index 00000000000000..74536373269704 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_700000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-700000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.4_1687336559193.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.4_1687336559193.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_700000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_700000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_700000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file From 649b171485dfa9f49993f57ea5b4b3253ccd1266 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:38:56 +0700 Subject: [PATCH 014/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de --- ..._base_gc4_64k_800000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md new file mode 100644 index 00000000000000..2ee16b3cc0b382 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_800000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-800000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.4_1687336668760.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.4_1687336668760.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_800000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_800000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_800000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file From bb4310b816ce5a755727174d241879faafc76744 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:39:56 +0700 Subject: [PATCH 015/149] Add model 2023-06-21-legalectra_base_es --- .../2023-06-21-legalectra_base_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md new file mode 100644 index 00000000000000..c18507c00caedd --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Legal Electra Word Embeddings Base model +author: John Snow Labs +name: legalectra_base +date: 2023-06-21 +tags: [open_source, legalectra, embeddings, electra, legal, es, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Spanish Legal Word Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legalectra-base-spanish` is a Spanish model originally trained by `mrm8488`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.4_1687336669896.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.4_1687336669896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +electra = BertEmbeddings.pretrained("legalectra_base","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra]) + +data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val electra = BertEmbeddings.pretrained("legalectra_base","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra)) + +val data = Seq("Amo a Spark NLP.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert.base_legal").predict("""Amo a Spark NLP.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +electra = BertEmbeddings.pretrained("legalectra_base","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra]) + +data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val electra = BertEmbeddings.pretrained("legalectra_base","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra)) + +val data = Seq("Amo a Spark NLP.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert.base_legal").predict("""Amo a Spark NLP.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|legalectra_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|408.5 MB| +|Case sensitive:|true| \ No newline at end of file From eda21c7a889ff0d14d29fd5a5582ed50614d11ee Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:40:56 +0700 Subject: [PATCH 016/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de --- ..._base_gc4_64k_900000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md new file mode 100644 index 00000000000000..08ffc8924552f0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_900000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-900000-cased-generator` is a German model originally trained by `stefan-it`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.4_1687336789214.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.4_1687336789214.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_900000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_900000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_900000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file From 2ca91e57c065d1052dab8e0f718ce4f0e7ebedb9 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:41:56 +0700 Subject: [PATCH 017/149] Add model 2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en --- ...gs_scibert_scivocab_finetuned_cord19_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md new file mode 100644 index 00000000000000..5376ed57e932ae --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English BERT Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: bert_embeddings_scibert_scivocab_finetuned_cord19 +date: 2023-06-21 +tags: [en, open_source, bert, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `scibert_scivocab-finetuned-CORD19` is an English model originally trained by `mrm8488`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.4_1687336817133.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.4_1687336817133.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.scibert.cord19_scibert.finetuned").predict("""PUT YOUR STRING HERE""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.scibert.cord19_scibert.finetuned").predict("""PUT YOUR STRING HERE""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_scibert_scivocab_finetuned_cord19| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|409.8 MB| +|Case sensitive:|true| \ No newline at end of file From 83bde351bfa7b7a48b26712179e1ff0008f0ab44 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:42:56 +0700 Subject: [PATCH 018/149] Add model 2023-06-21-bert_embeddings_InLegalBERT_en --- ...23-06-21-bert_embeddings_InLegalBERT_en.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md new file mode 100644 index 00000000000000..c485e4c203f876 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Legal English BERT Embeddings (from law-ai) +author: John Snow Labs +name: bert_embeddings_InLegalBERT +date: 2023-06-21 +tags: [bert, en, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `InLegalBERT` is an English model originally trained by `law-ai`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.4_1687336959265.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.4_1687336959265.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_InLegalBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file From dfd25ab38e517d4d3d3b9bd80f51c5b7a8ff1117 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:43:56 +0700 Subject: [PATCH 019/149] Add model 2023-06-21-bert_embeddings_InCaseLawBERT_en --- ...-06-21-bert_embeddings_InCaseLawBERT_en.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md new file mode 100644 index 00000000000000..bb601d4aa558d8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md @@ -0,0 +1,135 @@ +--- +layout: model +title: English BERT Embeddings (from law-ai) +author: John Snow Labs +name: bert_embeddings_InCaseLawBERT +date: 2023-06-21 +tags: [bert, en, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `InCaseLawBERT` is an English model originally trained by `law-ai`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.4_1687336500304.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.4_1687336500304.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_InCaseLawBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.8 MB| +|Case sensitive:|true| \ No newline at end of file From c39d6e6a960007a6d40766a50d274f86b83b3eda Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:45:16 +0700 Subject: [PATCH 020/149] Add model 2023-06-21-bert_base_uncased_contracts_en --- ...23-06-21-bert_base_uncased_contracts_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md new file mode 100644 index 00000000000000..7940efd351cf2b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Legal Contracts BertEmbeddings model (Base, Uncased) +author: John Snow Labs +name: bert_base_uncased_contracts +date: 2023-06-21 +tags: [open_source, bert, embeddings, finance, contracts, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Word Embeddings model, trained on legal contracts, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert-base-uncased-contracts` is an English model originally trained by `nlpaueb`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.4_1687337099443.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.4_1687337099443.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP.").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.contracts.uncased_base").predict("""I love Spark NLP.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP.").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.contracts.uncased_base").predict("""I love Spark NLP.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_base_uncased_contracts| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file From e965624f1e282d6302bf0f3d0142e28aeaac67b3 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:47:35 +0700 Subject: [PATCH 021/149] Add model 2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr --- ...a_base_turkish_mc4_uncased_generator_tr.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md new file mode 100644 index 00000000000000..27834e8f8f4ba8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Turkish Electra Embeddings (from dbmdz) +author: John Snow Labs +name: electra_embeddings_electra_base_turkish_mc4_uncased_generator +date: 2023-06-21 +tags: [tr, open_source, electra, embeddings, onnx] +task: Embeddings +language: tr +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-turkish-mc4-uncased-generator` is a Turkish model originally trained by `dbmdz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.4_1687337246703.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.4_1687337246703.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.uncased_base").predict("""Spark NLP'yi seviyorum""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.uncased_base").predict("""Spark NLP'yi seviyorum""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_turkish_mc4_uncased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tr| +|Size:|130.0 MB| +|Case sensitive:|false| \ No newline at end of file From 7ae00449cf4575cf2d01fa27633a939a31b53ed3 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:48:41 +0700 Subject: [PATCH 022/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de --- ..._base_gc4_64k_500000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md new file mode 100644 index 00000000000000..34fc199ce62012 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_500000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-500000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.4_1687337310787.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.4_1687337310787.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_500000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_500000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_500000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file From 1b3351a8152644bcb7ae048b877784d196ef3eb4 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:49:41 +0700 Subject: [PATCH 023/149] Add model 2023-06-21-electra_embeddings_electra_base_generator_en --- ...ra_embeddings_electra_base_generator_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md new file mode 100644 index 00000000000000..1134d2ff11d8f7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Electra Embeddings (from google) +author: John Snow Labs +name: electra_embeddings_electra_base_generator +date: 2023-06-21 +tags: [en, open_source, electra, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-generator` is an English model originally trained by `google`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.4_1687337315482.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.4_1687337315482.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|125.7 MB| +|Case sensitive:|true| \ No newline at end of file From 77847d22f11b23c2851b3a57919e0a7777c6bd20 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:50:42 +0700 Subject: [PATCH 024/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de --- ..._base_gc4_64k_200000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md new file mode 100644 index 00000000000000..2c1aec5f485cb5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_200000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-200000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.4_1687337323809.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.4_1687337323809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_200000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_200000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_200000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file From f58dde9cd43cd29e24808e62deceae6dc72d07bc Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:51:42 +0700 Subject: [PATCH 025/149] Add model 2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it --- ...tra_base_italian_xxl_cased_generator_it.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md new file mode 100644 index 00000000000000..e7d940835a2164 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Electra Embeddings (from dbmdz) +author: John Snow Labs +name: electra_embeddings_electra_base_italian_xxl_cased_generator +date: 2023-06-21 +tags: [it, open_source, electra, embeddings, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-italian-xxl-cased-generator` is an Italian model originally trained by `dbmdz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.4_1687337384147.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.4_1687337384147.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.electra.cased_xxl_base").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.electra.cased_xxl_base").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_italian_xxl_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|it| +|Size:|127.4 MB| +|Case sensitive:|true| \ No newline at end of file From 0182101201d1157541185d1f8fcb895fad354320 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:52:42 +0700 Subject: [PATCH 026/149] Add model 2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en --- ...oclinicalbert_finetuned_covid_papers_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md new file mode 100644 index 00000000000000..ae97a5cf8eece2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English BERT Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: bert_embeddings_bioclinicalbert_finetuned_covid_papers +date: 2023-06-21 +tags: [en, open_source, bert, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bioclinicalBERT-finetuned-covid-papers` is an English model originally trained by `mrm8488`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.4_1687337369326.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.4_1687337369326.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.covid_bio_clinical.finetuned").predict("""PUT YOUR STRING HERE""") +``` + +</div>
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.covid_bio_clinical.finetuned").predict("""PUT YOUR STRING HERE""") +``` +</div>
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bioclinicalbert_finetuned_covid_papers| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|403.2 MB| +|Case sensitive:|true| \ No newline at end of file From 88663c770130357b8de3d3a34840c456164dc863 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:53:42 +0700 Subject: [PATCH 027/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de --- ...base_gc4_64k_1000000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md new file mode 100644 index 00000000000000..010216d521b613 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_1000000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-1000000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.4_1687337566476.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.4_1687337566476.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_1000000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_1000000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_1000000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file From 0534c585f895ef3af4fa4717ed862648e26bf16c Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:54:42 +0700 Subject: [PATCH 028/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de --- ..._base_gc4_64k_600000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md new file mode 100644 index 00000000000000..02adfa19883aa8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_600000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-600000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.4_1687337438102.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.4_1687337438102.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_600000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_600000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_600000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file From 3c13d8406b6aad71f0b94a67c5cd09fd9e18d598 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:55:42 +0700 Subject: [PATCH 029/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de --- ..._base_gc4_64k_400000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md new file mode 100644 index 00000000000000..45fedae9b51534 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_400000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-400000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.4_1687337577697.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.4_1687337577697.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_400000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_400000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_400000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file From 662939a3bfdf4366f04d293050682e4011b9843c Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:56:42 +0700 Subject: [PATCH 030/149] Add model 2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko --- ...ngs_finance_koelectra_base_generator_ko.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md new file mode 100644 index 00000000000000..49fffd2af1f284 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from krevas) +author: John Snow Labs +name: electra_embeddings_finance_koelectra_base_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial Korean Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `finance-koelectra-base-generator` is a Korean model originally trained by `krevas`. This is a Base model. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.4_1687337679070.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.4_1687337679070.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_finance_koelectra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|129.1 MB| +|Case sensitive:|true| \ No newline at end of file From 795450eb57a18a91c5f24c3d2569322c5b40005a Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:57:42 +0700 Subject: [PATCH 031/149] Add model 2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko --- ...beddings_koelectra_base_v2_generator_ko.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md new file mode 100644 index 00000000000000..f53d78f8396e90 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_base_v2_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-base-v2-generator` is a Korean model originally trained by `monologg`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.4_1687337792559.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.4_1687337792559.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_koelectra_base_v2_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|129.7 MB| +|Case sensitive:|true| \ No newline at end of file From 06585333b80d2614890c049814c3db2b7724fc7e Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:58:43 +0700 Subject: [PATCH 032/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de --- ..._base_gc4_64k_300000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md new file mode 100644 index 00000000000000..1d5f7c779e4a9b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_300000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-300000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.4_1687337742127.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.4_1687337742127.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_300000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_300000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_300000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file From 4ee0f36733790a71c36f08832c904f6c897d02c6 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 15:59:43 +0700 Subject: [PATCH 033/149] Add model 2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr --- ...tra_base_turkish_mc4_cased_generator_tr.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md new file mode 100644 index 00000000000000..604b5b0ec2c299 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Turkish Electra Embeddings (from dbmdz) +author: John Snow Labs +name: electra_embeddings_electra_base_turkish_mc4_cased_generator +date: 2023-06-21 +tags: [tr, open_source, electra, embeddings, onnx] +task: Embeddings +language: tr +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-turkish-mc4-cased-generator` is a Turkish model originally trained by `dbmdz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.4_1687337596423.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.4_1687337596423.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.cased_base").predict("""Spark NLP'yi seviyorum""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.cased_base").predict("""Spark NLP'yi seviyorum""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_turkish_mc4_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tr| +|Size:|129.9 MB| +|Case sensitive:|true| \ No newline at end of file From bcdb2d7352edb65dc6eec21f426e1c3465ec17d9 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:00:43 +0700 Subject: [PATCH 034/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de --- ...ectra_base_gc4_64k_0_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md new file mode 100644 index 00000000000000..f31b3fb5e86370 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_0_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-0-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.4_1687337406782.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.4_1687337406782.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_64d").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_64d").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_0_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|221.4 MB| +|Case sensitive:|true| \ No newline at end of file From c28e4e4f0056de260640146d57fadafb0fee64d6 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:01:43 +0700 Subject: [PATCH 035/149] Add model 2023-06-21-electra_embeddings_electra_small_generator_en --- ...a_embeddings_electra_small_generator_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md new file mode 100644 index 00000000000000..65889ff972397c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Electra Embeddings (from google) +author: John Snow Labs +name: electra_embeddings_electra_small_generator +date: 2023-06-21 +tags: [en, open_source, electra, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-small-generator` is an English model originally trained by `google`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.4_1687337729115.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.4_1687337729115.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.small").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.small").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_small_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|50.8 MB| +|Case sensitive:|true| \ No newline at end of file From 5f109a9e1544e8ce1295bbc9a3923945535c10ec Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:02:44 +0700 Subject: [PATCH 036/149] Add model 2023-06-21-electra_embeddings_electra_large_generator_en --- ...a_embeddings_electra_large_generator_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md new file mode 100644 index 00000000000000..ea273b2caeb31c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Electra Embeddings (from google) +author: John Snow Labs +name: electra_embeddings_electra_large_generator +date: 2023-06-21 +tags: [en, open_source, electra, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-large-generator` is an English model originally trained by `google`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.4_1687337805375.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.4_1687337805375.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.large").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.large").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_large_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|191.2 MB| +|Case sensitive:|true| \ No newline at end of file From 01654d822f9cb8eb9a1201b9cb9aeba51c5f03b3 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:03:44 +0700 Subject: [PATCH 037/149] Add model 2023-06-21-electra_embeddings_electricidad_base_generator_es --- ...beddings_electricidad_base_generator_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md new file mode 100644 index 00000000000000..f8d5fb802d26dc --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Electra Uncased Embeddings (Oscar dataset) +author: John Snow Labs +name: electra_embeddings_electricidad_base_generator +date: 2023-06-21 +tags: [es, open_source, electra, embeddings, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electricidad-base-generator` is a Spanish model originally trained by `mrm8488`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.4_1687337686007.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.4_1687337686007.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.electra.base").predict("""Amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.electra.base").predict("""Amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electricidad_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|es| +|Size:|126.1 MB| +|Case sensitive:|true| \ No newline at end of file From 9304071c3cda9c2450575f122033c06858f6e022 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:04:44 +0700 Subject: [PATCH 038/149] Add model 2023-06-21-electra_embeddings_gelectra_large_generator_de --- ..._embeddings_gelectra_large_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md new file mode 100644 index 00000000000000..719a9cd57ed8a0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from deepset) +author: John Snow Labs +name: electra_embeddings_gelectra_large_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `gelectra-large-generator` is a German model originally trained by `deepset`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.4_1687338033613.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.4_1687338033613.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.large").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.large").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_gelectra_large_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|193.5 MB| +|Case sensitive:|true| \ No newline at end of file From 6f7acf7c96dae699e87b2d2613c9675a63581635 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:05:44 +0700 Subject: [PATCH 039/149] Add model 2023-06-21-electra_embeddings_koelectra_base_generator_ko --- ..._embeddings_koelectra_base_generator_ko.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md new file mode 100644 index 00000000000000..42848be76dc939 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_base_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-base-generator` is a Korean model originally trained by `monologg`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.4_1687337873576.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.4_1687337873576.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_koelectra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|130.1 MB| +|Case sensitive:|true| \ No newline at end of file From 2f82152d3c589d9e69bf4beab73d1bf0949d9c70 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:06:44 +0700 Subject: [PATCH 040/149] Add model 2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko --- ...beddings_koelectra_base_v3_generator_ko.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md new file mode 100644 index 00000000000000..0cf5183f230459 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_base_v3_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-base-v3-generator` is a Korean model originally trained by `monologg`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.4_1687337798528.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.4_1687337798528.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_koelectra_base_v3_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|137.3 MB| +|Case sensitive:|true| \ No newline at end of file From c57578a4fdea1ea41330aac7cf95f80c65ef28f8 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:07:45 +0700 Subject: [PATCH 041/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de --- ...ra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md index f31b3fb5e86370..e342e0a50c0654 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.4_1687337406782.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.4_1687337406782.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.4_1687338403600.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.4_1687338403600.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use From 285a2e10e959d3c90881216c6faf562b3f22b6c8 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:08:45 +0700 Subject: [PATCH 042/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de --- ..._base_gc4_64k_100000_cased_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md new file mode 100644 index 00000000000000..c94835e08e9c8a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_100000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-100000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.4_1687337430315.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.4_1687337430315.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_100000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_100000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_100000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.0 MB| +|Case sensitive:|true| \ No newline at end of file From 5527e0a2fbb42542e736a4d3fe9528f15a03e88d Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:09:45 +0700 Subject: [PATCH 043/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de --- ...beddings_electra_base_gc4_64k_400000_cased_generator_de.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md index 45fedae9b51534..e7d78cc73de8e1 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.4_1687337577697.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.4_1687337577697.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.4_1687338531671.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.4_1687338531671.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use From 438e6ee828302af114d87ad44ad83d84faf6977d Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:10:45 +0700 Subject: [PATCH 044/149] Add model 2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de --- ...beddings_electra_base_gc4_64k_600000_cased_generator_de.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md index 02adfa19883aa8..f29eff6d36ec34 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.4_1687337438102.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.4_1687337438102.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.4_1687338289447.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.4_1687338289447.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use From 2b4c024eb2396c691e1631681c7a03acc5099036 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:11:45 +0700 Subject: [PATCH 045/149] Add model 2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl --- ...lectra_tagalog_small_cased_generator_tl.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md new file mode 100644 index 00000000000000..b132ce16178180 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_small_cased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. 
`electra-tagalog-small-cased-generator` is a Tagalog model originally trained by `jcblaise`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.4_1687338628903.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.4_1687338628903.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_small").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_small").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_small_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|18.2 MB| +|Case sensitive:|true| \ No newline at end of file From 160797d96ed90a2ca79b74b006b0cfd6218cbfdd Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:12:45 +0700 Subject: [PATCH 046/149] Add model 2023-06-21-electra_embeddings_gelectra_base_generator_de --- ...a_embeddings_gelectra_base_generator_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md new file mode 100644 index 00000000000000..cd680ca947182c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from deepset) +author: John Snow Labs +name: electra_embeddings_gelectra_base_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `gelectra-base-generator` is a German model originally trained by `deepset`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.4_1687338626775.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.4_1687338626775.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.base").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.base").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_gelectra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|127.6 MB| +|Case sensitive:|true| \ No newline at end of file From 2247fb42aba2a31ed70c212f8828623b1de987af Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:13:45 +0700 Subject: [PATCH 047/149] Add model 2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl --- ...electra_tagalog_base_cased_generator_tl.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md new file mode 100644 index 00000000000000..0a11f3ced2e5f2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_base_cased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-base-cased-generator` is a Tagalog model originally trained by `jcblaise`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.4_1687338660491.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.4_1687338660491.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_base").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_base").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_base_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|129.9 MB| +|Case sensitive:|true| \ No newline at end of file From ad9e9a2f1429c58fd9a490eab36f610711bba951 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:14:45 +0700 Subject: [PATCH 048/149] Add model 2023-06-21-bert_sentence_embeddings_financial_de --- ...1-bert_sentence_embeddings_financial_de.md | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md new file mode 100644 index 00000000000000..f7a957e5d314da --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md @@ -0,0 +1,151 @@ +--- +layout: model +title: German Financial Bert Word Embeddings +author: John Snow Labs +name: bert_sentence_embeddings_financial +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, financial, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Although in the name of the model you will see the word `sentence`, this is a Word Embeddings Model. + +Financial Pretrained BERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `german-financial-statements-bert` is a German model originally trained by `fabianrausch`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.4_1687338810949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.4_1687338810949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark-NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert.finance").predict("""Ich liebe Spark-NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark-NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert.finance").predict("""Ich liebe Spark-NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_sentence_embeddings_financial| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file From 1ab946ad24a196f9b613945d3bee974ceb2c5da3 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:15:46 +0700 Subject: [PATCH 049/149] Add model 2023-06-21-electra_embeddings_electra_small_japanese_generator_ja --- ...ngs_electra_small_japanese_generator_ja.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md new file mode 100644 index 00000000000000..6944eeb7698db4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Japanese Electra Embeddings (from Cinnamon) +author: John Snow Labs +name: electra_embeddings_electra_small_japanese_generator +date: 2023-06-21 +tags: [ja, open_source, electra, embeddings, onnx] +task: Embeddings +language: ja +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-small-japanese-generator` is a Japanese model originally trained by `Cinnamon`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.4_1687338737717.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.4_1687338737717.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLPが大好きです"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLPが大好きです").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLPが大好きです"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLPが大好きです").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_small_japanese_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ja| +|Size:|51.7 MB| +|Case sensitive:|true| \ No newline at end of file From 784513b2b0295a12bf9dc7ba51765c8aac4dc1e5 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:16:46 +0700 Subject: [PATCH 050/149] Add model 2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl --- ...ectra_tagalog_base_uncased_generator_tl.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md new file mode 100644 index 00000000000000..cb0c52829db71c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_base_uncased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-base-uncased-generator` is a Tagalog model originally trained by `jcblaise`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.4_1687338703736.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.4_1687338703736.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_base").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_base").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_base_uncased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|129.9 MB| +|Case sensitive:|false| \ No newline at end of file From 52d807302e29be98787baca69044c1ba0e9eb3ef Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:17:46 +0700 Subject: [PATCH 051/149] Add model 2023-06-21-electra_embeddings_koelectra_small_generator_ko --- ...embeddings_koelectra_small_generator_ko.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md new file mode 100644 index 00000000000000..f5881a5cbb4be8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_small_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-small-generator` is a Korean model originally trained by `monologg`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.4_1687338723919.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.4_1687338723919.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_koelectra_small_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|51.7 MB| +|Case sensitive:|true| \ No newline at end of file From 8f7245cea2d3af184461fd4105caa884e362ff86 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:18:46 +0700 Subject: [PATCH 052/149] Add model 2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko --- ...gs_finance_koelectra_small_generator_ko.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md new file mode 100644 index 00000000000000..5955e86f590f3a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from krevas) +author: John Snow Labs +name: electra_embeddings_finance_koelectra_small_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial Korean Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `finance-koelectra-small-generator` is a Korean model originally trained by `krevas`. This is a small (sm) version. Other bigger versions are available. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.4_1687338677896.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.4_1687338677896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_finance_koelectra_small_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|51.5 MB| +|Case sensitive:|true| \ No newline at end of file From 86cb08b2d7f19874acdc2920be4ba88b262a52dc Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:19:46 +0700 Subject: [PATCH 053/149] Add model 2023-06-21-bert_embeddings_sec_bert_base_en --- ...-06-21-bert_embeddings_sec_bert_base_en.md | 154 ++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md new file mode 100644 index 00000000000000..2be531ad5a4f8d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md @@ -0,0 +1,154 @@ +--- +layout: model +title: Financial English BERT Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_sec_bert_base +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, financial, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Financial Pretrained BERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sec-bert-base` is an English model originally trained by `nlpaueb`. This is the reference base model, which means it uses the same architecture as BERT-BASE trained on financial documents. 
+ +If you are interested in Financial Embeddings, also take a look at these two models: + +- [sec-num](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_num_en_3_0.html): Same as this base model but we replace every number token with a [NUM] pseudo-token handling all numeric expressions in a uniform manner, disallowing their fragmentation. +- [sec-shape](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_sh_en_3_0.html): Same as this base model but we replace numbers with pseudo-tokens that represent the number’s shape, so numeric expressions (of known shapes) are no longer fragmented, e.g., '53.2' becomes '[XX.X]' and '40,200.5' becomes '[XX,XXX.X]'. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.4_1687339042219.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.4_1687339042219.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sec_bert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.5 MB| +|Case sensitive:|true| \ No newline at end of file From 054823d990add5f5461702789d35bdd507acc5ea Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:20:46 +0700 Subject: [PATCH 054/149] Add model 2023-06-21-electra_embeddings_kr_electra_generator_ko --- ...ctra_embeddings_kr_electra_generator_ko.md | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md new file mode 100644 index 00000000000000..146c04252c2129 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from snunlp) +author: John Snow Labs +name: electra_embeddings_kr_electra_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `KR-ELECTRA-generator` is a Korean model originally trained by `snunlp`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.4_1687338860027.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.4_1687338860027.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_kr_electra_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|124.1 MB| +|Case sensitive:|true| \ No newline at end of file From a9d7101678009171a2e12f6f98a3960fd51ffca4 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:21:46 +0700 Subject: [PATCH 055/149] Add model 2023-06-21-bert_embeddings_sec_bert_sh_en --- ...23-06-21-bert_embeddings_sec_bert_sh_en.md | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md new file mode 100644 index 00000000000000..67cc5c1d89bcf3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md @@ -0,0 +1,155 @@ +--- +layout: model +title: Financial English BERT Embeddings (Number shape masking) +author: John Snow Labs +name: bert_embeddings_sec_bert_sh +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, financial, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial BERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sec-bert-shape` is an English model originally trained by `nlpaueb`. This model is the same as Bert Base but we replace numbers with pseudo-tokens that represent the number’s shape, so numeric expressions (of known shapes) are no longer fragmented, e.g., '53.2' becomes '[XX.X]' and '40,200.5' becomes '[XX,XXX.X]'. 
+ +If you are interested in Financial Embeddings, also take a look at these two models: + +- [sec-base](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_base_en_3_0.html): Same as BERT Base but trained with financial documents. +- [sec-num](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_num_en_3_0.html): Same as Bert sec-base but we replace every number token with a [NUM] pseudo-token (handling all numeric expressions in a uniform manner, disallowing their fragmentation). + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.4_1687339128341.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.4_1687339128341.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_sh").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_sh").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sec_bert_sh| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.6 MB| +|Case sensitive:|true| \ No newline at end of file From 027d7b9bd12e8ce5bf85ac4ab8b8d23b111ee124 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:22:46 +0700 Subject: [PATCH 056/149] Add model 2023-06-21-bert_embeddings_german_financial_statements_bert_de --- ...ngs_german_financial_statements_bert_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md new file mode 100644 index 00000000000000..6a53f96a8fc12e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Financial Bert Word Embeddings +author: John Snow Labs +name: bert_embeddings_german_financial_statements_bert +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial Bert Word Embeddings model, trained on German Financial Statements. Uploaded to Hugging Face, adapted and imported into Spark NLP. `german-financial-statements-bert` is a German Financial model originally trained on 100,000 natural language annual financial statements. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.4_1687339007310.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.4_1687339007310.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.german_financial_statements_bert").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.german_financial_statements_bert").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_german_financial_statements_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file From 5ee0ec93b122c8e337aad37aa97754f4703b2b41 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:23:46 +0700 Subject: [PATCH 057/149] Add model 2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl --- ...ctra_tagalog_small_uncased_generator_tl.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md new file mode 100644 index 00000000000000..8962b0c8151a21 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_small_uncased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-small-uncased-generator` is a Tagalog model originally trained by `jcblaise`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.4_1687338586547.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.4_1687338586547.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_small").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_small").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_small_uncased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|18.2 MB| +|Case sensitive:|false| \ No newline at end of file From b35ee1e3938b3acdbf6a209d34a6d5b34801dc6b Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:24:47 +0700 Subject: [PATCH 058/149] Add model 2023-06-21-bert_embeddings_javanese_bert_small_jv --- ...-bert_embeddings_javanese_bert_small_jv.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md new file mode 100644 index 00000000000000..424b3c45fbd045 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese Bert Embeddings (Small, Wikipedia) +author: John Snow Labs +name: bert_embeddings_javanese_bert_small +date: 2023-06-21 +tags: [bert, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-bert-small` is a Javanese model originally trained by `w11wo`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.4_1687339377809.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.4_1687339377809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_javanese_bert_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|407.3 MB| +|Case sensitive:|true| \ No newline at end of file From 3c8f957a68a8e46858c92afa7835ef0833a46fd8 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:25:47 +0700 Subject: [PATCH 059/149] Add model 2023-06-21-bert_embeddings_finest_bert_en --- ...23-06-21-bert_embeddings_finest_bert_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md new file mode 100644 index 00000000000000..4ab11423e6fe62 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Multilingual (Finnish, Estonian, English) Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_finest_bert +date: 2023-06-21 +tags: [bert, embeddings, fi, et, en, xx, multilingual, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `finest-bert` is an English model originally trained by `EMBEDDIA`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.4_1687339089124.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.4_1687339089124.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finest_bert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finest_bert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_finest_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|535.1 MB| +|Case sensitive:|true| \ No newline at end of file From 6f4b142454c86c7e148e39757fa2751a9beaa466 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:26:47 +0700 Subject: [PATCH 060/149] Add model 2023-06-21-bert_embeddings_indic_transformers_te_bert_te --- ...mbeddings_indic_transformers_te_bert_te.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md new file mode 100644 index 00000000000000..a19eaffd72a24a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Telugu Bert Embeddings (from neuralspace-reverie) +author: John Snow Labs +name: bert_embeddings_indic_transformers_te_bert +date: 2023-06-21 +tags: [bert, embeddings, te, open_source, onnx] +task: Embeddings +language: te +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-te-bert` is a Telugu model originally trained by `neuralspace-reverie`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.4_1687339517415.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.4_1687339517415.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.indic_transformers_te_bert").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.indic_transformers_te_bert").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_indic_transformers_te_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|te| +|Size:|609.1 MB| +|Case sensitive:|true| \ No newline at end of file From 85e7b3022f18287f002aaef19eb6d802bb148968 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:28:59 +0700 Subject: [PATCH 061/149] Add model 2023-06-21-bert_embeddings_gbert_base_de --- ...023-06-21-bert_embeddings_gbert_base_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md new file mode 100644 index 00000000000000..e142d8aa5e7623 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Bert Embeddings (Base, Cased) +author: John Snow Labs +name: bert_embeddings_gbert_base +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `gbert-base` is a German model originally trained by `deepset`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.4_1687339723694.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.4_1687339723694.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.gbert_base").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.gbert_base").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_gbert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|409.7 MB| +|Case sensitive:|true| \ No newline at end of file From 39aaf3264f6669bd8e6b580be8c33b32c3d66333 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:33:07 +0700 Subject: [PATCH 062/149] Add model 2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi --- ...mbeddings_indic_transformers_hi_bert_hi.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md new file mode 100644 index 00000000000000..4f0ec15e3eda26 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Hindi Bert Embeddings +author: John Snow Labs +name: bert_embeddings_indic_transformers_hi_bert +date: 2023-06-21 +tags: [bert, embeddings, hi, open_source, onnx] +task: Embeddings +language: hi +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-hi-bert` is a Hindi model originally trained by `neuralspace-reverie`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.4_1687339963111.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.4_1687339963111.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_bert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_bert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_indic_transformers_hi_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|hi| +|Size:|609.2 MB| +|Case sensitive:|true| \ No newline at end of file From d99bad4bd73cddaa00dcb03bbc4fdd549c1a2f51 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:35:39 +0700 Subject: [PATCH 063/149] Add model 2023-06-21-bert_embeddings_hateBERT_en --- .../2023-06-21-bert_embeddings_hateBERT_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md new file mode 100644 index 00000000000000..5d95f85bb63367 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from GroNLP) +author: John Snow Labs +name: bert_embeddings_hateBERT +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.4 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `hateBERT` is an English model originally trained by `GroNLP`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.4_1687340123478.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.4_1687340123478.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.hateBERT").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.hateBERT").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_hateBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.1 MB| +|Case sensitive:|true| \ No newline at end of file From 98605324816b5ec6bccf40e0fc38d1aeb50c5ab9 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:36:39 +0700 Subject: [PATCH 064/149] Add model 2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en --- ...ives_scancode_bert_base_uncased_L8_1_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md new file mode 100644 index 00000000000000..8bee08b62d0c53 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Uncased) +author: John Snow Labs +name: bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `false-positives-scancode-bert-base-uncased-L8-1` is an English model originally trained by `ayansinha`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en_5.0.0_3.0_1687340166023.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en_5.0.0_3.0_1687340166023.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.false_positives_scancode_bert_base_uncased_L8_1").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.false_positives_scancode_bert_base_uncased_L8_1").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file From e5229fda6f738563aa093bc8ba5eae1d48c91271 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:41:38 +0700 Subject: [PATCH 065/149] Add model 2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en --- ...eddings_finbert_pretrain_yiyanghkust_en.md | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md new file mode 100644 index 00000000000000..1d9cfa6ac26325 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: Financial English Bert Embeddings (Base, Communication texts) +author: John Snow Labs +name: bert_embeddings_finbert_pretrain_yiyanghkust +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Financial English Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `finbert-pretrain-yiyanghkust` is an English model originally available in Hugging Face as `yiyanghkust/finbert-pretrain`. 
It was trained on the following datasets: + +- Corporate Reports 10-K & 10-Q: 2.5B tokens +- Earnings Call Transcripts: 1.3B tokens +- Analyst Reports: 1.1B tokens + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340469127.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340469127.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finbert_pretrain_yiyanghkust").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finbert_pretrain_yiyanghkust").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_finbert_pretrain_yiyanghkust| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.4 MB| +|Case sensitive:|true| \ No newline at end of file From 75a9d5092d868de4b00bf02a074d987f75f29fa3 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:42:38 +0700 Subject: [PATCH 066/149] Add model 2023-06-21-bert_embeddings_indic_transformers_te_bert_te --- ...3-06-21-bert_embeddings_indic_transformers_te_bert_te.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md index a19eaffd72a24a..286777564e9003 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, te, open_source, onnx] task: Embeddings language: te edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.4_1687339517415.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.4_1687339517415.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.0_1687340459352.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.0_1687340459352.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use From 02c30979105ef8a111189cb97856be97fab5fc2c Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:46:39 +0700 Subject: [PATCH 067/149] Add model 2023-06-21-bert_embeddings_hseBert_it_cased_it --- ...-21-bert_embeddings_hseBert_it_cased_it.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md new file mode 100644 index 00000000000000..0ac43ca8f34236 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Bert Embeddings (from bullmount) +author: John Snow Labs +name: bert_embeddings_hseBert_it_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `hseBert-it-cased` is an Italian model originally trained by `bullmount`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_hseBert_it_cased_it_5.0.0_3.0_1687340783377.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_hseBert_it_cased_it_5.0.0_3.0_1687340783377.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.hseBert_it_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.hseBert_it_cased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_hseBert_it_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file From 85b50bfc0e5b79abeb2bd02a0d569eccd3ef49fe Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:48:37 +0700 Subject: [PATCH 068/149] Add model 2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en --- ...3-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md index 1d9cfa6ac26325..a8f6efb7ae0030 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md @@ -32,8 +32,8 @@ Financial English Bert Embeddings model, uploaded to Hugging Face, adapted and i {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340469127.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340469127.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340890257.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340890257.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use From 95b26636756fd6767a216b13372f2d10fbbc8f29 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:49:38 +0700 Subject: [PATCH 069/149] Add model 2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es --- ..._spanish_question_encoder_allqa_base_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md new file mode 100644 index 00000000000000..fbf26507302d99 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (Base, Question, Allqa) +author: John Snow Labs +name: bert_embeddings_dpr_spanish_question_encoder_allqa_base +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-question_encoder-allqa-base` is a Spanish model originally trained by `IIC`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_allqa_base_es_5.0.0_3.0_1687340961201.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_allqa_base_es_5.0.0_3.0_1687340961201.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dpr_spanish_question_encoder_allqa_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|409.5 MB| +|Case sensitive:|true| \ No newline at end of file From 7d73b101bfdacad8048ca4bba792b3ce99f919c4 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:52:14 +0700 Subject: [PATCH 070/149] Add model 2023-06-21-bert_embeddings_dziribert_ar --- ...2023-06-21-bert_embeddings_dziribert_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md new file mode 100644 index 00000000000000..4a1f66ee841cec --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from alger-ia) +author: John Snow Labs +name: bert_embeddings_dziribert +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dziribert` is an Arabic model originally trained by `alger-ia`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dziribert_ar_5.0.0_3.0_1687341113062.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dziribert_ar_5.0.0_3.0_1687341113062.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.dziribert").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.dziribert").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dziribert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|462.5 MB| +|Case sensitive:|true| \ No newline at end of file From 0b7f03063a79242f7e58083fdbfb31e47afd96f9 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:53:14 +0700 Subject: [PATCH 071/149] Add model 2023-06-21-bert_embeddings_deberta_base_uncased_en --- ...bert_embeddings_deberta_base_uncased_en.md | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md new file mode 100644 index 00000000000000..22fae6bd9c819c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: English BertForMaskedLM Base Uncased model (from mlcorelib) +author: John Snow Labs +name: bert_embeddings_deberta_base_uncased +date: 2023-06-21 +tags: [en, open_source, bert_embeddings, bertformaskedlm, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertForMaskedLM model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `deberta-base-uncased` is an English model originally trained by `mlcorelib`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_deberta_base_uncased_en_5.0.0_3.0_1687341134871.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_deberta_base_uncased_en_5.0.0_3.0_1687341134871.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_base_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_base_uncased").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_deberta_base_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file From c448cc6b57dde009692ddc87d6b5734d269c1288 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:54:14 +0700 Subject: [PATCH 072/149] Add model 2023-06-21-bert_embeddings_dbert_ko --- .../2023-06-21-bert_embeddings_dbert_ko.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md new file mode 100644 index 00000000000000..0c6f71e2a0ddb4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Korean Bert Embeddings (from deeq) +author: John Snow Labs +name: bert_embeddings_dbert +date: 2023-06-21 +tags: [bert, embeddings, ko, open_source, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dbert` is a Korean model originally trained by `deeq`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dbert_ko_5.0.0_3.0_1687341138674.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dbert_ko_5.0.0_3.0_1687341138674.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.dbert").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.dbert").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ko| +|Size:|421.2 MB| +|Case sensitive:|true| \ No newline at end of file From 5cc53b26259441c3bb9f35a02f779a789a321f9c Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:55:14 +0700 Subject: [PATCH 073/149] Add model 2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv --- ..._embeddings_javanese_bert_small_imdb_jv.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md new file mode 100644 index 00000000000000..9f834ccdc83fa0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese Bert Embeddings (Small, Imdb) +author: John Snow Labs +name: bert_embeddings_javanese_bert_small_imdb +date: 2023-06-21 +tags: [bert, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-bert-small-imdb` is a Javanese model originally trained by `w11wo`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_imdb_jv_5.0.0_3.0_1687341195384.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_imdb_jv_5.0.0_3.0_1687341195384.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small_imdb").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small_imdb").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_javanese_bert_small_imdb| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|407.3 MB| +|Case sensitive:|true| \ No newline at end of file From 90b986a15770e2c5b714934dc1c8f7bb1e7c1cb5 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:56:14 +0700 Subject: [PATCH 074/149] Add model 2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es --- ...spanish_passage_encoder_squades_base_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md new file mode 100644 index 00000000000000..5b4101b177a992 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (Base, Passage, Squades) +author: John Snow Labs +name: bert_embeddings_dpr_spanish_passage_encoder_squades_base +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-passage_encoder-squades-base` is a Spanish model originally trained by `IIC`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_squades_base_es_5.0.0_3.0_1687341276775.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_squades_base_es_5.0.0_3.0_1687341276775.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dpr_spanish_passage_encoder_squades_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|409.5 MB| +|Case sensitive:|true| \ No newline at end of file From 395fd62762d2416c1a9e27a5359262b5dcc42451 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:57:58 +0700 Subject: [PATCH 075/149] Add model 2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es --- ...panish_question_encoder_squades_base_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md new file mode 100644 index 00000000000000..537f94ae494528 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (Base, Question, Squades) +author: John Snow Labs +name: bert_embeddings_dpr_spanish_question_encoder_squades_base +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-question_encoder-squades-base` is a Spanish model originally trained by `IIC`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_squades_base_es_5.0.0_3.0_1687341460131.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_squades_base_es_5.0.0_3.0_1687341460131.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dpr_spanish_question_encoder_squades_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|409.5 MB| +|Case sensitive:|true| \ No newline at end of file From eb26168798df93e9878b067303c47a267a421b2d Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 16:58:58 +0700 Subject: [PATCH 076/149] Add model 2023-06-21-bert_embeddings_crosloengual_bert_en --- ...21-bert_embeddings_crosloengual_bert_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md new file mode 100644 index 00000000000000..89a6839a95ee56 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Multilingual (Croatian, Slovenian, English) Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_crosloengual_bert +date: 2023-06-21 +tags: [bert, embeddings, en, hr, sl, xx, multilingual, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `crosloengual-bert` is an English model originally trained by `EMBEDDIA`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_crosloengual_bert_en_5.0.0_3.0_1687341501117.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_crosloengual_bert_en_5.0.0_3.0_1687341501117.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.crosloengual_bert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.crosloengual_bert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_crosloengual_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|463.4 MB| +|Case sensitive:|true| \ No newline at end of file From 464231155660cae24c503d14cb151ee27a5c8eab Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:04:27 +0700 Subject: [PATCH 077/149] Add model 2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en --- ...ddings_clinical_pubmed_bert_base_512_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md new file mode 100644 index 00000000000000..2c83ed64e84a41 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical English Bert Embeddings (Base, 512 dimension) +author: John Snow Labs +name: bert_embeddings_clinical_pubmed_bert_base_512 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `clinical-pubmed-bert-base-512` is an English model originally trained by `Tsubasaz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_512_en_5.0.0_3.0_1687341838471.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_512_en_5.0.0_3.0_1687341838471.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_512").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_512").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_clinical_pubmed_bert_base_512| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|408.0 MB| +|Case sensitive:|true| \ No newline at end of file From e8ac13e22f1ddd5f6b7ee6e8eb818c25cf8a7b3d Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:05:27 +0700 Subject: [PATCH 078/149] Add model 2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es --- ...r_spanish_passage_encoder_allqa_base_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md new file mode 100644 index 00000000000000..622e96cea211e0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (Base, Passage, Allqa) +author: John Snow Labs +name: bert_embeddings_dpr_spanish_passage_encoder_allqa_base +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-passage_encoder-allqa-base` is a Spanish model originally trained by `IIC`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es_5.0.0_3.0_1687341854288.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es_5.0.0_3.0_1687341854288.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dpr_spanish_passage_encoder_allqa_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|409.5 MB| +|Case sensitive:|true| \ No newline at end of file From a146658c16870b4920f28596a1bb5fd13dafe0ea Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:06:38 +0700 Subject: [PATCH 079/149] Add model 2023-06-21-bert_embeddings_legal_bert_base_uncased_en --- ...t_embeddings_legal_bert_base_uncased_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md new file mode 100644 index 00000000000000..a490d46ea569a7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Legal English Bert Embeddings (Base, Uncased) +author: John Snow Labs +name: bert_embeddings_legal_bert_base_uncased +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, legal, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Legal Pretrained Bert Embeddings model, trained with uncased text, uploaded to Hugging Face, adapted and imported into Spark NLP. `legal-bert-base-uncased` is an English model originally trained by `nlpaueb`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_legal_bert_base_uncased_en_5.0.0_3.0_1687341978829.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_legal_bert_base_uncased_en_5.0.0_3.0_1687341978829.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.legal_bert_base_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.legal_bert_base_uncased").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_legal_bert_base_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|false| \ No newline at end of file From 3fbe05d66e8c5717ef071dc53822069c61168f99 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:13:37 +0700 Subject: [PATCH 080/149] Add model 2023-06-21-biobert_embeddings_all_pt --- .../2023-06-21-biobert_embeddings_all_pt.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md new file mode 100644 index 00000000000000..a38274560def80 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical Portuguese Bert Embeddings (Biomedical and Clinical) +author: John Snow Labs +name: biobert_embeddings_all +date: 2023-06-21 +tags: [biobert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BioBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `biobertpt-all` is a Portuguese model originally trained by `pucpr`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_embeddings_all_pt_5.0.0_3.0_1687342387740.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_embeddings_all_pt_5.0.0_3.0_1687342387740.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_all").predict("""Odeio o cancro""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_all").predict("""Odeio o cancro""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|biobert_embeddings_all| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|664.8 MB| +|Case sensitive:|true| \ No newline at end of file From a45a28a84147ae8f589f0507c2814535da40e7bd Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:14:37 +0700 Subject: [PATCH 081/149] Add model 2023-06-21-bert_embeddings_wineberto_italian_cased_it --- ...t_embeddings_wineberto_italian_cased_it.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md new file mode 100644 index 00000000000000..3984e73e05bfad --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Embeddings (Base, Wines description) +author: John Snow Labs +name: bert_embeddings_wineberto_italian_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `wineberto-italian-cased` is an Italian model originally trained by `vinhood`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687342408297.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687342408297.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.wineberto_italian_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.wineberto_italian_cased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_wineberto_italian_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file From d46cf99329d2f556c59d49b192c2854d88f28a61 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:18:05 +0700 Subject: [PATCH 082/149] Add model 2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en --- ...ddings_clinical_pubmed_bert_base_128_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md new file mode 100644 index 00000000000000..ecc100c7def6f7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical English Bert Embeddings (Base, 128 dimension) +author: John Snow Labs +name: bert_embeddings_clinical_pubmed_bert_base_128 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `clinical-pubmed-bert-base-128` is an English model originally trained by `Tsubasaz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_128_en_5.0.0_3.0_1687342663053.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_128_en_5.0.0_3.0_1687342663053.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_128").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_128").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_clinical_pubmed_bert_base_128| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|408.0 MB| +|Case sensitive:|true| \ No newline at end of file From 648a8fb32a3d2da7772cb85795823a78001f5571 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:21:58 +0700 Subject: [PATCH 083/149] Add model 2023-06-21-biobert_embeddings_clinical_pt --- ...23-06-21-biobert_embeddings_clinical_pt.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md new file mode 100644 index 00000000000000..d8a80c64c56b34 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical Portuguese Bert Embeddings (Clinical) +author: John Snow Labs +name: biobert_embeddings_clinical +date: 2023-06-21 +tags: [biobert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BioBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `biobertpt-clin` is a Portuguese model originally trained by `pucpr`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_embeddings_clinical_pt_5.0.0_3.0_1687342893170.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_embeddings_clinical_pt_5.0.0_3.0_1687342893170.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_clinical").predict("""Odeio o cancro""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_clinical").predict("""Odeio o cancro""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|biobert_embeddings_clinical| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|665.0 MB| +|Case sensitive:|true| \ No newline at end of file From 20df8e463e503ed8791f14293add9db69cdad287 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:24:04 +0700 Subject: [PATCH 084/149] Add model 2023-06-21-bert_embeddings_telugu_bertu_te --- ...3-06-21-bert_embeddings_telugu_bertu_te.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md new file mode 100644 index 00000000000000..a521a954fb4236 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Telugu Bert Embeddings +author: John Snow Labs +name: bert_embeddings_telugu_bertu +date: 2023-06-21 +tags: [bert, embeddings, te, open_source, onnx] +task: Embeddings +language: te +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `telugu_bertu` is a Telugu model originally trained by `kuppuluri`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_telugu_bertu_te_5.0.0_3.0_1687343021533.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_telugu_bertu_te_5.0.0_3.0_1687343021533.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.telugu_bertu").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.telugu_bertu").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_telugu_bertu| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|te| +|Size:|412.5 MB| +|Case sensitive:|true| \ No newline at end of file From 23a74d253ca085be8aa0204f7bc7e7e066bae8a7 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:26:44 +0700 Subject: [PATCH 085/149] Add model 2023-06-21-bert_embeddings_wobert_chinese_plus_zh --- ...-bert_embeddings_wobert_chinese_plus_zh.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md new file mode 100644 index 00000000000000..698131409af3a5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Chinese Bert Embeddings (from qinluo) +author: John Snow Labs +name: bert_embeddings_wobert_chinese_plus +date: 2023-06-21 +tags: [bert, embeddings, zh, open_source, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `wobert-chinese-plus` is a Chinese model originally trained by `qinluo`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_wobert_chinese_plus_zh_5.0.0_3.0_1687343185496.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_wobert_chinese_plus_zh_5.0.0_3.0_1687343185496.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.wobert_chinese_plus").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.wobert_chinese_plus").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_wobert_chinese_plus| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|zh| +|Size:|464.5 MB| +|Case sensitive:|true| \ No newline at end of file From 7e4706df8e324e15b54f402a16ee2ff270ebbddd Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:28:29 +0700 Subject: [PATCH 086/149] Add model 2023-06-21-bert_embeddings_wineberto_italian_cased_it --- .../2023-06-21-bert_embeddings_wineberto_italian_cased_it.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md index 3984e73e05bfad..64d487ee4850f5 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md @@ -28,8 +28,8 @@ Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687342408297.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687342408297.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687343289463.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687343289463.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use From 4784a7d9fc74e461e33eb716683756b7b87da2fe Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:29:29 +0700 Subject: [PATCH 087/149] Add model 2023-06-21-bert_embeddings_sikuroberta_zh --- ...23-06-21-bert_embeddings_sikuroberta_zh.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md new file mode 100644 index 00000000000000..68d128c492b7cd --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Chinese Bert Embeddings (from SIKU-BERT) +author: John Snow Labs +name: bert_embeddings_sikuroberta +date: 2023-06-21 +tags: [bert, embeddings, zh, open_source, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sikuroberta` is a Chinese model originally trained by `SIKU-BERT`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikuroberta_zh_5.0.0_3.0_1687343322944.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikuroberta_zh_5.0.0_3.0_1687343322944.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikuroberta").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikuroberta").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sikuroberta| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|zh| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file From 279be879874eb71c68447b67117438efceb3a174 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:30:30 +0700 Subject: [PATCH 088/149] Add model 2023-06-21-biobert_embeddings_biomedical_pt --- ...-06-21-biobert_embeddings_biomedical_pt.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md new file mode 100644 index 00000000000000..b86fe840106cfa --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical Portuguese Bert Embeddings (Biomedical) +author: John Snow Labs +name: biobert_embeddings_biomedical +date: 2023-06-21 +tags: [biobert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BioBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `biobertpt-bio` is a Portuguese model originally trained by `pucpr`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_embeddings_biomedical_pt_5.0.0_3.0_1687343400949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_embeddings_biomedical_pt_5.0.0_3.0_1687343400949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_biomedical").predict("""Odeio o cancro""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_biomedical").predict("""Odeio o cancro""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|biobert_embeddings_biomedical| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|665.0 MB| +|Case sensitive:|true| \ No newline at end of file From ea76aebd1a4cd7ee0bda20f4a41b19f6e23f802c Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 17:36:03 +0700 Subject: [PATCH 089/149] Add model 2023-06-21-bert_embeddings_sikubert_zh --- .../2023-06-21-bert_embeddings_sikubert_zh.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md new file mode 100644 index 00000000000000..ce8ab67067aa3f --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Chinese Bert Embeddings (Siku Quanshu corpus) +author: John Snow Labs +name: bert_embeddings_sikubert +date: 2023-06-21 +tags: [bert, embeddings, zh, open_source, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sikubert` is a Chinese model originally trained by `SIKU-BERT`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikubert_zh_5.0.0_3.0_1687343740087.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikubert_zh_5.0.0_3.0_1687343740087.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikubert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikubert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sikubert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|zh| +|Size:|406.0 MB| +|Case sensitive:|true| \ No newline at end of file From 03c18aff366135d9510f5b46fe23bd2d214038d1 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 19:33:03 +0700 Subject: [PATCH 090/149] Add model 2023-06-21-bert_embeddings_psych_search_en --- ...3-06-21-bert_embeddings_psych_search_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md new file mode 100644 index 00000000000000..f12d896dcbc372 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from nlp4good) +author: John Snow Labs +name: bert_embeddings_psych_search +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `psych-search` is an English model originally trained by `nlp4good`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_psych_search_en_5.0.0_3.0_1687350768319.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_psych_search_en_5.0.0_3.0_1687350768319.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.psych_search").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.psych_search").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_psych_search| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file From 23ac4c09db8dacda389bd5875b956d4ba42fbf02 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 19:34:47 +0700 Subject: [PATCH 091/149] Add model 2023-06-21-bert_embeddings_marathi_bert_mr --- ...3-06-21-bert_embeddings_marathi_bert_mr.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md new file mode 100644 index 00000000000000..6b39af7927411f --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Marathi Bert Embeddings +author: John Snow Labs +name: bert_embeddings_marathi_bert +date: 2023-06-21 +tags: [bert, embeddings, mr, open_source, onnx] +task: Embeddings +language: mr +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `marathi-bert` is a Marathi model originally trained by `l3cube-pune`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_marathi_bert_mr_5.0.0_3.0_1687350857061.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_marathi_bert_mr_5.0.0_3.0_1687350857061.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.marathi_bert").predict("""मला स्पार्क एनएलपी आवडते""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.marathi_bert").predict("""मला स्पार्क एनएलपी आवडते""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_marathi_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|mr| +|Size:|665.1 MB| +|Case sensitive:|true| \ No newline at end of file From a84c7decb7a0a0fd7efe58ed0ad932c1486058da Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 19:37:19 +0700 Subject: [PATCH 092/149] Add model 2023-06-21-bert_embeddings_netbert_en --- .../2023-06-21-bert_embeddings_netbert_en.md | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md new file mode 100644 index 00000000000000..2edd51e298a1d8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: English BertForMaskedLM Cased model (from antoinelouis) +author: John Snow Labs +name: bert_embeddings_netbert +date: 2023-06-21 +tags: [en, open_source, bert_embeddings, bertformaskedlm, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertForMaskedLM model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `netbert` is an English model originally trained by `antoinelouis`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_netbert_en_5.0.0_3.0_1687351022341.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_netbert_en_5.0.0_3.0_1687351022341.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.netbert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.netbert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_netbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.1 MB| +|Case sensitive:|true| \ No newline at end of file From 9ff2565238c78a29c4caab43f71e733242c8e086 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 19:40:05 +0700 Subject: [PATCH 093/149] Add model 2023-06-21-bert_embeddings_mbert_ar_c19_ar --- ...3-06-21-bert_embeddings_mbert_ar_c19_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md new file mode 100644 index 00000000000000..3d6a26c549f735 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Mbert model, Covid-19) +author: John Snow Labs +name: bert_embeddings_mbert_ar_c19 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `mbert_ar_c19` is an Arabic model originally trained by `moha`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_mbert_ar_c19_ar_5.0.0_3.0_1687351164607.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_mbert_ar_c19_ar_5.0.0_3.0_1687351164607.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.mbert_ar_c19").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.mbert_ar_c19").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_mbert_ar_c19| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|624.7 MB| +|Case sensitive:|true| \ No newline at end of file From 56f121f4ef86d3b9fe4d756bea805397be7f15c0 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 19:41:05 +0700 Subject: [PATCH 094/149] Add model 2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar --- ...dings_multi_dialect_bert_base_arabic_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md new file mode 100644 index 00000000000000..bc40d7959ba191 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from bashar-talafha) +author: John Snow Labs +name: bert_embeddings_multi_dialect_bert_base_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `multi-dialect-bert-base-arabic` is an Arabic model originally trained by `bashar-talafha`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_multi_dialect_bert_base_arabic_ar_5.0.0_3.0_1687351229326.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_multi_dialect_bert_base_arabic_ar_5.0.0_3.0_1687351229326.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.multi_dialect_bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.multi_dialect_bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_multi_dialect_bert_base_arabic| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|411.5 MB| +|Case sensitive:|true| \ No newline at end of file From b8f3ba88baa04c0e8d613c258ad93f1ec1a2f5ba Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 19:46:53 +0700 Subject: [PATCH 095/149] Add model 2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en --- ...class_scancode_bert_base_cased_L32_1_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md new file mode 100644 index 00000000000000..6e0309525b771a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Cased) +author: John Snow Labs +name: bert_embeddings_lic_class_scancode_bert_base_cased_L32_1 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `lic-class-scancode-bert-base-cased-L32-1` is an English model originally trained by `ayansinha`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en_5.0.0_3.0_1687351576851.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en_5.0.0_3.0_1687351576851.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.lic_class_scancode_bert_base_cased_L32_1").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.lic_class_scancode_bert_base_cased_L32_1").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_lic_class_scancode_bert_base_cased_L32_1| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.6 MB| +|Case sensitive:|true| \ No newline at end of file From dfa197c91adfab23a2706d37a060594eea1253d7 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 20:39:44 +0700 Subject: [PATCH 096/149] Add model 2023-06-21-bert_embeddings_MARBERTv2_ar --- ...2023-06-21-bert_embeddings_MARBERTv2_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md new file mode 100644 index 00000000000000..26d222b5c236ee --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (MARBERT model v2) +author: John Snow Labs +name: bert_embeddings_MARBERTv2 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `MARBERTv2` is an Arabic model originally trained by `UBC-NLP`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERTv2_ar_5.0.0_3.0_1687354749271.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERTv2_ar_5.0.0_3.0_1687354749271.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERTv2").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERTv2").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_MARBERTv2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|606.5 MB| +|Case sensitive:|true| \ No newline at end of file From 526f65d7668bc8d47d3b2109240a384c0b60a8b5 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 20:42:55 +0700 Subject: [PATCH 097/149] Add model 2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt --- ...mbeddings_bert_base_cased_pt_lenerbr_pt.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md new file mode 100644 index 00000000000000..ae5920ac1cb795 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Portuguese Legal Bert Embeddings (Cased) +author: John Snow Labs +name: bert_embeddings_bert_base_cased_pt_lenerbr +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Legal Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-cased-pt-lenerbr` is a Portuguese model originally trained by `pierreguillou`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_cased_pt_lenerbr_pt_5.0.0_3.0_1687354957150.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_cased_pt_lenerbr_pt_5.0.0_3.0_1687354957150.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_cased_pt_lenerbr").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_cased_pt_lenerbr").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_cased_pt_lenerbr| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file From 294eb23545e02874e165390717f5f0ab454ce5d0 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 20:45:07 +0700 Subject: [PATCH 098/149] Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar --- ..._bert_base_arabic_camelbert_msa_half_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md new file mode 100644 index 00000000000000..1ef43767d97397 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Trained on a half of the full MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa_half +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-half` is an Arabic model originally trained by `CAMeL-Lab`. &#13;
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_half_ar_5.0.0_3.0_1687355081033.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_half_ar_5.0.0_3.0_1687355081033.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_half").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_half").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_half| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file From fc1b3a2da8fcc66fe75f93e35d2cd3e5d421637d Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 20:46:08 +0700 Subject: [PATCH 099/149] Add model 2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de --- ...ings_bert_base_german_cased_oldvocab_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md new file mode 100644 index 00000000000000..1f985340872ac9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Bert Embeddings (Base, Cased, Old Vocabulary) +author: John Snow Labs +name: bert_embeddings_bert_base_german_cased_oldvocab +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-german-cased-oldvocab` is a German model originally trained by `deepset`. &#13;
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_cased_oldvocab_de_5.0.0_3.0_1687355117712.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_cased_oldvocab_de_5.0.0_3.0_1687355117712.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_cased_oldvocab").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_cased_oldvocab").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_german_cased_oldvocab| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file From 868256f4a552add31f1aaa98efea01f44e02b50e Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 20:47:58 +0700 Subject: [PATCH 100/149] Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar --- ...dings_bert_base_arabic_camelbert_msa_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md new file mode 100644 index 00000000000000..97bcdf7edd7fe2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa` is an Arabic model originally trained by `CAMeL-Lab`. &#13;
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_ar_5.0.0_3.0_1687355261025.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_ar_5.0.0_3.0_1687355261025.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file From 774b7a1e490c925bff19c99b678c6d9361aede75 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 23:53:36 +0700 Subject: [PATCH 101/149] Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar --- ...ert_base_arabic_camelbert_msa_eighth_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md new file mode 100644 index 00000000000000..6d2176403c972d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Trained on an eighth of the full MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa_eighth +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-eighth` is an Arabic model originally trained by `CAMeL-Lab`. &#13;
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar_5.0.0_3.0_1687366398028.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar_5.0.0_3.0_1687366398028.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_eighth").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_eighth").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_eighth| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file From a224310304c9c9bbe4cf90b42f20d84df41a069c Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 23:55:25 +0700 Subject: [PATCH 102/149] Add model 2023-06-21-bert_embeddings_bert_base_german_uncased_de --- ..._embeddings_bert_base_german_uncased_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md new file mode 100644 index 00000000000000..91fa66ad63e5d8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Bert Embeddings +author: John Snow Labs +name: bert_embeddings_bert_base_german_uncased +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-german-uncased` is a German model originally trained by `dbmdz`. &#13;
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_uncased_de_5.0.0_3.0_1687366506395.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_uncased_de_5.0.0_3.0_1687366506395.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_uncased").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_uncased").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_german_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file From 74b0c9b04a22629ba03f13972a7e99d1ac9fb6c6 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 23:56:25 +0700 Subject: [PATCH 103/149] Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar --- ...rt_base_arabic_camelbert_msa_quarter_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md new file mode 100644 index 00000000000000..622c54ebe635f9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Trained on a quarter of the full MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa_quarter +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-quarter` is an Arabic model originally trained by `CAMeL-Lab`. &#13;
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar_5.0.0_3.0_1687366524279.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar_5.0.0_3.0_1687366524279.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_quarter").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_quarter").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_quarter| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file From a25308c4c248e1ad272963b6e94a7a64650e2f4a Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 23:57:25 +0700 Subject: [PATCH 104/149] Add model 2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de --- ...bert_base_historical_german_rw_cased_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md new file mode 100644 index 00000000000000..4626d471479c49 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Bert Embeddings (from redewiedergabe) +author: John Snow Labs +name: bert_embeddings_bert_base_historical_german_rw_cased +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-historical-german-rw-cased` is a German model originally trained by `redewiedergabe`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_historical_german_rw_cased_de_5.0.0_3.0_1687366604668.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_historical_german_rw_cased_de_5.0.0_3.0_1687366604668.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_historical_german_rw_cased").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_historical_german_rw_cased").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_historical_german_rw_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file From c19e5feeec057c4dfd910aea00763d20d00e79b9 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 23:58:25 +0700 Subject: [PATCH 105/149] Add model 2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it --- ...ddings_bert_base_italian_xxl_uncased_it.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md new file mode 100644 index 00000000000000..8f1b62389ff59c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Bert Embeddings (Uncased) +author: John Snow Labs +name: bert_embeddings_bert_base_italian_xxl_uncased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-italian-xxl-uncased` is an Italian model originally trained by `dbmdz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_uncased_it_5.0.0_3.0_1687366606479.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_uncased_it_5.0.0_3.0_1687366606479.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_uncased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_uncased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_italian_xxl_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file From 26869e683ee982c02374145c22e12f06a9ee805d Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Wed, 21 Jun 2023 23:59:25 +0700 Subject: [PATCH 106/149] Add model 2023-06-21-bert_embeddings_bert_base_arabertv2_ar --- ...-bert_embeddings_bert_base_arabertv2_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md new file mode 100644 index 00000000000000..28aa7881be3bad --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model, v2) +author: John Snow Labs +name: bert_embeddings_bert_base_arabertv2 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv2` is an Arabic model originally trained by `aubmindlab`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv2_ar_5.0.0_3.0_1687366696592.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv2_ar_5.0.0_3.0_1687366696592.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv2").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv2").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabertv2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.8 MB| +|Case sensitive:|true| \ No newline at end of file From 540fbb8c30b8296264aeb60b7322d9a06662cf41 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:00:32 +0700 Subject: [PATCH 107/149] Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar --- ..._base_arabic_camelbert_msa_sixteenth_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md new file mode 100644 index 00000000000000..efc6980941ceab --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Trained on a sixteenth of the full MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-sixteenth` is an Arabic model originally trained by `CAMeL-Lab`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar_5.0.0_3.0_1687366813331.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar_5.0.0_3.0_1687366813331.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_sixteenth").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_sixteenth").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.4 MB| +|Case sensitive:|true| \ No newline at end of file From 746964dad222f8506f619be89a9bc27c066436f4 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:01:33 +0700 Subject: [PATCH 108/149] Add model 2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar --- ...dings_bert_base_arabic_camelbert_mix_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md new file mode 100644 index 00000000000000..2db1ef256735f2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, DA-CA-MSA variants) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_mix +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-mix` is an Arabic model originally trained by `CAMeL-Lab`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_mix_ar_5.0.0_3.0_1687366836156.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_mix_ar_5.0.0_3.0_1687366836156.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_mix").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_mix").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_mix| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.6 MB| +|Case sensitive:|true| \ No newline at end of file From 513512e7e2be6d9c2a49301f4250e2ffc1b070c1 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:04:17 +0700 Subject: [PATCH 109/149] Add model 2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it --- ...beddings_bert_base_italian_xxl_cased_it.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md new file mode 100644 index 00000000000000..ab7513a407cb6c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Bert Embeddings (Cased) +author: John Snow Labs +name: bert_embeddings_bert_base_italian_xxl_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-italian-xxl-cased` is an Italian model originally trained by `dbmdz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_cased_it_5.0.0_3.0_1687367037078.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_cased_it_5.0.0_3.0_1687367037078.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_cased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_italian_xxl_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file From 198cc3bb664a730dbb42ed1068ccd3ec6bd1146c Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:05:17 +0700 Subject: [PATCH 110/149] Add model 2023-06-21-bert_embeddings_bert_base_gl_cased_pt --- ...1-bert_embeddings_bert_base_gl_cased_pt.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md new file mode 100644 index 00000000000000..5b67a75c4e936e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Portuguese Bert Embeddings (Base, Cased) +author: John Snow Labs +name: bert_embeddings_bert_base_gl_cased +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-gl-cased` is a Portuguese model originally trained by `marcosgg`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_gl_cased_pt_5.0.0_3.0_1687367086939.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_gl_cased_pt_5.0.0_3.0_1687367086939.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_gl_cased").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_gl_cased").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_gl_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|664.5 MB| +|Case sensitive:|true| \ No newline at end of file From 010ed3f27f14b31a56389df6d6eef1aeb83eca5a Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:09:04 +0700 Subject: [PATCH 111/149] Add model 2023-06-21-bert_embeddings_MARBERT_ar --- .../2023-06-21-bert_embeddings_MARBERT_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md new file mode 100644 index 00000000000000..df8e6020ce97b2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (MARBERT model) +author: John Snow Labs +name: bert_embeddings_MARBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `MARBERT` is an Arabic model originally trained by `UBC-NLP`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERT_ar_5.0.0_3.0_1687367317123.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERT_ar_5.0.0_3.0_1687367317123.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERT").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERT").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_MARBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|608.6 MB| +|Case sensitive:|true| \ No newline at end of file From f207f4cd1dc04abdeced392e3e35d6f9000d8525 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:10:20 +0700 Subject: [PATCH 112/149] Add model 2023-06-21-bert_embeddings_AraBertMo_base_V1_ar --- ...21-bert_embeddings_AraBertMo_base_V1_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md new file mode 100644 index 00000000000000..507d80ddf48d81 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from Ebtihal) +author: John Snow Labs +name: bert_embeddings_AraBertMo_base_V1 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `AraBertMo_base_V1` is an Arabic model originally trained by `Ebtihal`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_AraBertMo_base_V1_ar_5.0.0_3.0_1687367402700.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_AraBertMo_base_V1_ar_5.0.0_3.0_1687367402700.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.AraBertMo_base_V1").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.AraBertMo_base_V1").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_AraBertMo_base_V1| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|407.8 MB| +|Case sensitive:|true| \ No newline at end of file From 53669a475a0ad7587d1646853b3fe8671982bc65 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:12:17 +0700 Subject: [PATCH 113/149] Add model 2023-06-21-bert_embeddings_bert_base_arabic_ar --- ...-21-bert_embeddings_bert_base_arabic_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md new file mode 100644 index 00000000000000..ec1f8a7ed29f13 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic` is an Arabic model originally trained by `asafaya`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_ar_5.0.0_3.0_1687367514433.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_ar_5.0.0_3.0_1687367514433.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|412.0 MB| +|Case sensitive:|true| \ No newline at end of file From c2bcf61593407724b48053f6b6c321b5387181a7 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:13:26 +0700 Subject: [PATCH 114/149] Add model 2023-06-21-bert_embeddings_DarijaBERT_ar --- ...023-06-21-bert_embeddings_DarijaBERT_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md new file mode 100644 index 00000000000000..019db2765f56c4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from Kamel) +author: John Snow Labs +name: bert_embeddings_DarijaBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `DarijaBERT` is an Arabic model originally trained by `Kamel`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_DarijaBERT_ar_5.0.0_3.0_1687367582690.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_DarijaBERT_ar_5.0.0_3.0_1687367582690.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.DarijaBERT").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.DarijaBERT").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_DarijaBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|551.5 MB| +|Case sensitive:|true| \ No newline at end of file From 9cb887475740ccdcb20347cbe64e7b5c7446cd28 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:15:33 +0700 Subject: [PATCH 115/149] Add model 2023-06-21-bert_embeddings_Ara_DialectBERT_ar --- ...6-21-bert_embeddings_Ara_DialectBERT_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md new file mode 100644 index 00000000000000..aa25c8bbbd15f0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from MutazYoune) +author: John Snow Labs +name: bert_embeddings_Ara_DialectBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `Ara_DialectBERT` is an Arabic model originally trained by `MutazYoune`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_Ara_DialectBERT_ar_5.0.0_3.0_1687367717615.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_Ara_DialectBERT_ar_5.0.0_3.0_1687367717615.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.Ara_DialectBERT").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.Ara_DialectBERT").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_Ara_DialectBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file From 4b2d52f3a31f93c8b8ae26e8d93e2fafc4bf5715 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:16:33 +0700 Subject: [PATCH 116/149] Add model 2023-06-21-bert_embeddings_German_MedBERT_de --- ...06-21-bert_embeddings_German_MedBERT_de.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md new file mode 100644 index 00000000000000..6f9f54bac44943 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Medical Bert Embeddings +author: John Snow Labs +name: bert_embeddings_German_MedBERT +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained German Medical Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `German-MedBERT` is a German model originally trained by `smanjil`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_German_MedBERT_de_5.0.0_3.0_1687367757622.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_German_MedBERT_de_5.0.0_3.0_1687367757622.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.medbert").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.medbert").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_German_MedBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file From 9daf81431f0553de0db4663d46f8acd0057ff245 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:18:30 +0700 Subject: [PATCH 117/149] Add model 2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar --- ...eddings_bert_base_arabertv02_twitter_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md new file mode 100644 index 00000000000000..527a150609fdb3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model, v02, Twitter) +author: John Snow Labs +name: bert_embeddings_bert_base_arabertv02_twitter +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv02-twitter` is an Arabic model originally trained by `aubmindlab`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_twitter_ar_5.0.0_3.0_1687367879067.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_twitter_ar_5.0.0_3.0_1687367879067.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02_twitter").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02_twitter").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabertv02_twitter| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|505.0 MB| +|Case sensitive:|true| \ No newline at end of file From e2e72181e48ead087b7d4465b13724dc08f4625b Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:21:24 +0700 Subject: [PATCH 118/149] Add model 2023-06-21-bert_embeddings_FinancialBERT_en --- ...-06-21-bert_embeddings_FinancialBERT_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md new file mode 100644 index 00000000000000..f209c346ba5ecc --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Financial Bert Embeddings +author: John Snow Labs +name: bert_embeddings_FinancialBERT +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `FinancialBERT` is an English Financial model originally trained on a very large corpus of financial texts including Earnings Calls, Corporate reports, Bloomberg News, TRC2-financial. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_FinancialBERT_en_5.0.0_3.0_1687368067375.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_FinancialBERT_en_5.0.0_3.0_1687368067375.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.FinancialBERT").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.FinancialBERT").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_FinancialBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.4 MB| +|Case sensitive:|true| \ No newline at end of file From 1f3ae39ae2932fa02e9dce7d6bb5d50a7a893f32 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:26:51 +0700 Subject: [PATCH 119/149] Add model 2023-06-21-bert_embeddings_ARBERT_ar --- .../2023-06-21-bert_embeddings_ARBERT_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md new file mode 100644 index 00000000000000..a8bde5a9373131 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (ARBERT model) +author: John Snow Labs +name: bert_embeddings_ARBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `ARBERT` is an Arabic model originally trained by `UBC-NLP`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_ARBERT_ar_5.0.0_3.0_1687368387135.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_ARBERT_ar_5.0.0_3.0_1687368387135.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arbert").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arbert").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_ARBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|605.3 MB| +|Case sensitive:|true| \ No newline at end of file From 07fcedd300bbb53197d48b9af9f209f7364043b8 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:27:51 +0700 Subject: [PATCH 120/149] Add model 2023-06-21-bert_embeddings_COVID_SciBERT_en --- ...-06-21-bert_embeddings_COVID_SciBERT_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md new file mode 100644 index 00000000000000..1156a182032942 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from lordtt13) +author: John Snow Labs +name: bert_embeddings_COVID_SciBERT +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `COVID-SciBERT` is an English model originally trained by `lordtt13`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_COVID_SciBERT_en_5.0.0_3.0_1687368450114.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_COVID_SciBERT_en_5.0.0_3.0_1687368450114.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.COVID_SciBERT").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.COVID_SciBERT").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_COVID_SciBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|412.4 MB| +|Case sensitive:|true| \ No newline at end of file From 6fd0bbe9041b73a145c22eeed9d8183cbea0b730 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:29:41 +0700 Subject: [PATCH 121/149] Add model 2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es --- ...alberti_bert_base_multilingual_cased_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md new file mode 100644 index 00000000000000..334f6947d0078e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (from flax-community) +author: John Snow Labs +name: bert_embeddings_alberti_bert_base_multilingual_cased +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `alberti-bert-base-multilingual-cased` is a Spanish model originally trained by `flax-community`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_alberti_bert_base_multilingual_cased_es_5.0.0_3.0_1687368551885.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_alberti_bert_base_multilingual_cased_es_5.0.0_3.0_1687368551885.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.alberti_bert_base_multilingual_cased").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.alberti_bert_base_multilingual_cased").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_alberti_bert_base_multilingual_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|664.3 MB| +|Case sensitive:|true| \ No newline at end of file From 064ed9072511a9ef52f32bd724b2bca1002dcc46 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:35:14 +0700 Subject: [PATCH 122/149] Add model 2023-06-21-bert_embeddings_agriculture_bert_uncased_en --- ..._embeddings_agriculture_bert_uncased_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md new file mode 100644 index 00000000000000..0f8dd6b3d732db --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Base, Uncased, Agriculture) +author: John Snow Labs +name: bert_embeddings_agriculture_bert_uncased +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `agriculture-bert-uncased` is an English model originally trained by `recobo`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_agriculture_bert_uncased_en_5.0.0_3.0_1687368891491.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_agriculture_bert_uncased_en_5.0.0_3.0_1687368891491.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.agriculture_bert_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.agriculture_bert_uncased").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_agriculture_bert_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file From fd3c776f4124f31f69c301944d0a66ffcb5e1c0f Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:37:18 +0700 Subject: [PATCH 123/149] Add model 2023-06-21-bert_embeddings_bangla_bert_bn --- ...23-06-21-bert_embeddings_bangla_bert_bn.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md new file mode 100644 index 00000000000000..27aac09a7bc754 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Bangla Bert Embeddings (from Kowsher) +author: John Snow Labs +name: bert_embeddings_bangla_bert +date: 2023-06-21 +tags: [bert, embeddings, bn, open_source, onnx] +task: Embeddings +language: bn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bangla-bert` is a Bangla model originally trained by `Kowsher`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_bn_5.0.0_3.0_1687369015466.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_bn_5.0.0_3.0_1687369015466.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangla_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangla_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bangla_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|bn| +|Size:|612.1 MB| +|Case sensitive:|true| \ No newline at end of file From f5bb8958daf4a227c66437ac201abbc615f75df3 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:38:19 +0700 Subject: [PATCH 124/149] Add model 2023-06-21-bert_embeddings_bert_kor_base_ko --- ...-06-21-bert_embeddings_bert_kor_base_ko.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md new file mode 100644 index 00000000000000..cdf41dcad66847 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Korean Bert Embeddings (from kykim) +author: John Snow Labs +name: bert_embeddings_bert_kor_base +date: 2023-06-21 +tags: [bert, embeddings, ko, open_source, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-kor-base` is a Korean model originally trained by `kykim`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_kor_base_ko_5.0.0_3.0_1687369025243.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_kor_base_ko_5.0.0_3.0_1687369025243.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert_kor_base").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert_kor_base").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_kor_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ko| +|Size:|441.2 MB| +|Case sensitive:|true| \ No newline at end of file From 750d7375d8fa1ff1412d3cd4c520db33a1865d29 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:39:19 +0700 Subject: [PATCH 125/149] Add model 2023-06-21-bert_embeddings_bert_base_arabertv02_ar --- ...bert_embeddings_bert_base_arabertv02_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md new file mode 100644 index 00000000000000..00a46a50c9de69 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model, v02) +author: John Snow Labs +name: bert_embeddings_bert_base_arabertv02 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv02` is an Arabic model originally trained by `aubmindlab`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_ar_5.0.0_3.0_1687369054270.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_ar_5.0.0_3.0_1687369054270.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabertv02| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|505.1 MB| +|Case sensitive:|true| \ No newline at end of file From 5ec17af47237d3f234685c402a126004f9a946c9 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:42:44 +0700 Subject: [PATCH 126/149] Add model 2023-06-21-bert_embeddings_arabert_c19_ar --- ...23-06-21-bert_embeddings_arabert_c19_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md new file mode 100644 index 00000000000000..d94b26d12cfe68 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Arabert model, Covid-19) +author: John Snow Labs +name: bert_embeddings_arabert_c19 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `arabert_c19` is an Arabic model originally trained by `moha`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_arabert_c19_ar_5.0.0_3.0_1687369343067.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_arabert_c19_ar_5.0.0_3.0_1687369343067.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arabert_c19").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arabert_c19").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_arabert_c19| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.9 MB| +|Case sensitive:|true| \ No newline at end of file From eb415447a8e334be6c6d6a37ff023ec741467edd Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:54:54 +0700 Subject: [PATCH 127/149] Add model 2023-06-21-bert_embeddings_bert_base_5lang_cased_es --- ...ert_embeddings_bert_base_5lang_cased_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md new file mode 100644 index 00000000000000..99f607f5c1530b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (from amine) +author: John Snow Labs +name: bert_embeddings_bert_base_5lang_cased +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-5lang-cased` is a Spanish model originally trained by `amine`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_5lang_cased_es_5.0.0_3.0_1687370074087.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_5lang_cased_es_5.0.0_3.0_1687370074087.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert_base_5lang_cased").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert_base_5lang_cased").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_5lang_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|461.1 MB| +|Case sensitive:|true| \ No newline at end of file From 18bcd064bf70abc1886a93898234c67bbc891a40 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:55:54 +0700 Subject: [PATCH 128/149] Add model 2023-06-21-bert_embeddings_bert_base_arabertv01_ar --- ...bert_embeddings_bert_base_arabertv01_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md new file mode 100644 index 00000000000000..b0ffc23c7e5de1 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model, v01) +author: John Snow Labs +name: bert_embeddings_bert_base_arabertv01 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv01` is an Arabic model originally trained by `aubmindlab`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv01_ar_5.0.0_3.0_1687370107542.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv01_ar_5.0.0_3.0_1687370107542.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv01").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv01").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabertv01| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|505.0 MB| +|Case sensitive:|true| \ No newline at end of file From d3f289daf59c988568ead716bbe40403d8c4adb1 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:56:54 +0700 Subject: [PATCH 129/149] Add model 2023-06-21-bert_embeddings_bangla_bert_base_bn --- ...-21-bert_embeddings_bangla_bert_base_bn.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md new file mode 100644 index 00000000000000..05609a11fcf313 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Bangla Bert Embeddings +author: John Snow Labs +name: bert_embeddings_bangla_bert_base +date: 2023-06-21 +tags: [bert, embeddings, bn, open_source, onnx] +task: Embeddings +language: bn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bangla-bert-base` is a Bangla model originally trained by `sagorsarker`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_base_bn_5.0.0_3.0_1687370097955.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_base_bn_5.0.0_3.0_1687370097955.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangala_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangala_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bangla_bert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|bn| +|Size:|614.7 MB| +|Case sensitive:|true| \ No newline at end of file From bb907662f826ade888272601989d69e14f2f7add Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:01:24 +0700 Subject: [PATCH 130/149] Add model 2023-06-21-bert_embeddings_bert_medium_arabic_ar --- ...1-bert_embeddings_bert_medium_arabic_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md new file mode 100644 index 00000000000000..e4ab0e46ec7075 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Medium) +author: John Snow Labs +name: bert_embeddings_bert_medium_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-medium-arabic` is an Arabic model originally trained by `asafaya`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_medium_arabic_ar_5.0.0_3.0_1687370471346.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_medium_arabic_ar_5.0.0_3.0_1687370471346.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_medium_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_medium_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_medium_arabic| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|157.2 MB| +|Case sensitive:|true| \ No newline at end of file From 45a9ed2648231546bd0434618c30ba7ccdbc48e5 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:02:25 +0700 Subject: [PATCH 131/149] Add model 2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en --- ...t_political_election2020_twitter_mlm_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md new file mode 100644 index 00000000000000..ce6a900f6c10fe --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from kornosk) +author: John Snow Labs +name: bert_embeddings_bert_political_election2020_twitter_mlm +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-political-election2020-twitter-mlm` is an English model originally trained by `kornosk`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_political_election2020_twitter_mlm_en_5.0.0_3.0_1687370471142.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_political_election2020_twitter_mlm_en_5.0.0_3.0_1687370471142.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_political_election2020_twitter_mlm").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_political_election2020_twitter_mlm").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_political_election2020_twitter_mlm| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.6 MB| +|Case sensitive:|true| \ No newline at end of file From 08517c5f147d63c8c3d2f4a3a2c8d38492d5a1f4 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:04:25 +0700 Subject: [PATCH 132/149] Add model 2023-06-21-bert_embeddings_bert_mini_arabic_ar --- ...-21-bert_embeddings_bert_mini_arabic_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md new file mode 100644 index 00000000000000..d4f0e3cce03c7e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Mini) +author: John Snow Labs +name: bert_embeddings_bert_mini_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-mini-arabic` is an Arabic model originally trained by `asafaya`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_mini_arabic_ar_5.0.0_3.0_1687370518080.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_mini_arabic_ar_5.0.0_3.0_1687370518080.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_mini_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_mini_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_mini_arabic| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|43.3 MB| +|Case sensitive:|true| \ No newline at end of file From 20eb1eddf1ba39bfdcf8dcac6a8df1137f80d846 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:06:29 +0700 Subject: [PATCH 133/149] Add model 2023-06-21-bert_embeddings_bert_base_arabert_ar --- ...21-bert_embeddings_bert_base_arabert_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md new file mode 100644 index 00000000000000..e8151fef004624 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model) +author: John Snow Labs +name: bert_embeddings_bert_base_arabert +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabert` is an Arabic model originally trained by `aubmindlab`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabert_ar_5.0.0_3.0_1687370767272.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabert_ar_5.0.0_3.0_1687370767272.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabert").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabert").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.6 MB| +|Case sensitive:|true| \ No newline at end of file From b7e03447d35092e0b8a94e31c23fe6678e546f80 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:08:57 +0700 Subject: [PATCH 134/149] Add model 2023-06-21-bert_embeddings_beto_gn_base_cased_es --- ...1-bert_embeddings_beto_gn_base_cased_es.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md new file mode 100644 index 00000000000000..a0e6b0632c7e43 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (from mmaguero) +author: John Snow Labs +name: bert_embeddings_beto_gn_base_cased +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `beto-gn-base-cased` is a Spanish model originally trained by `mmaguero`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_beto_gn_base_cased_es_5.0.0_3.0_1687370922012.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_beto_gn_base_cased_es_5.0.0_3.0_1687370922012.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.beto_gn_base_cased").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.beto_gn_base_cased").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_beto_gn_base_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|408.6 MB| +|Case sensitive:|true| \ No newline at end of file From 3662115d7cb1eaaccc7d5060684c09558247c98a Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:09:58 +0700 Subject: [PATCH 135/149] Add model 2023-06-21-bert_embeddings_chemical_bert_uncased_en --- ...ert_embeddings_chemical_bert_uncased_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md new file mode 100644 index 00000000000000..8b4dcbb11e39e4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Base, Uncased, Chemical) +author: John Snow Labs +name: bert_embeddings_chemical_bert_uncased +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `chemical-bert-uncased` is an English model originally trained by `recobo`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_bert_uncased_en_5.0.0_3.0_1687370963306.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_bert_uncased_en_5.0.0_3.0_1687370963306.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.chemical_bert_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.chemical_bert_uncased").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_chemical_bert_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.1 MB| +|Case sensitive:|true| \ No newline at end of file From 11fe048b4e5585f51e34ef560e6c7ce8efccfb25 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:11:49 +0700 Subject: [PATCH 136/149] Add model 2023-06-21-bert_embeddings_bert_base_ko --- ...2023-06-21-bert_embeddings_bert_base_ko.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md new file mode 100644 index 00000000000000..586c43ffa392b7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Korean Bert Embeddings +author: John Snow Labs +name: bert_embeddings_bert_base +date: 2023-06-21 +tags: [bert, embeddings, ko, open_source, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base` is a Korean model originally trained by `klue`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_ko_5.0.0_3.0_1687371079238.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_ko_5.0.0_3.0_1687371079238.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ko| +|Size:|412.4 MB| +|Case sensitive:|true| \ No newline at end of file From cabc59ae02f1c7bfff1ab11a4231faef14ca7f67 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:13:46 +0700 Subject: [PATCH 137/149] Add model 2023-06-21-bert_embeddings_chefberto_italian_cased_it --- ...t_embeddings_chefberto_italian_cased_it.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md new file mode 100644 index 00000000000000..1e2661ff475de6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Embeddings (Base, Recipes) +author: John Snow Labs +name: bert_embeddings_chefberto_italian_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `chefberto-italian-cased` is an Italian model originally trained by `vinhood`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chefberto_italian_cased_it_5.0.0_3.0_1687371210449.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chefberto_italian_cased_it_5.0.0_3.0_1687371210449.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.chefberto_italian_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.chefberto_italian_cased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_chefberto_italian_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file From 97dcd828337119a30e2f173a1751b955c1a85ca4 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:14:46 +0700 Subject: [PATCH 138/149] Add model 2023-06-21-bert_embeddings_childes_bert_en --- ...3-06-21-bert_embeddings_childes_bert_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md new file mode 100644 index 00000000000000..60503fbe2496d8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from smeylan) +author: John Snow Labs +name: bert_embeddings_childes_bert +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `childes-bert` is an English model originally trained by `smeylan`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_childes_bert_en_5.0.0_3.0_1687371245330.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_childes_bert_en_5.0.0_3.0_1687371245330.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.childes_bert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.childes_bert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_childes_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file From ea9b48bf21939d6484a625db944d627d9bc6a54c Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:15:46 +0700 Subject: [PATCH 139/149] Add model 2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt --- ..._portuguese_cased_finetuned_peticoes_pt.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md new file mode 100644 index 00000000000000..1977eade89c812 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Legal Portuguese Embeddings (Base, Petitions) +author: John Snow Labs +name: bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-portuguese-cased-finetuned-peticoes` is a Portuguese model originally trained by `Luciano`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt_5.0.0_3.0_1687371316772.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt_5.0.0_3.0_1687371316772.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_peticoes").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_peticoes").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file From c4a442a3d640bc82ef2d39ca3eb1f45277b419ed Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:16:46 +0700 Subject: [PATCH 140/149] Add model 2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt --- ...tuguese_cased_finetuned_tcu_acordaos_pt.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md new file mode 100644 index 00000000000000..51c244361ef7c6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Legal Portuguese Embeddings (Base, Agreements) +author: John Snow Labs +name: bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-portuguese-cased-finetuned-tcu-acordaos` is a Portuguese model originally trained by `Luciano`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt_5.0.0_3.0_1687371364352.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt_5.0.0_3.0_1687371364352.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_tcu_acordaos").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_tcu_acordaos").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file From c4d1765e016ffbf8e1ecf6fb7939710a9f4518c4 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:22:02 +0700 Subject: [PATCH 141/149] Add model 2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt --- ...mbeddings_bert_base_portuguese_cased_pt.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md new file mode 100644 index 00000000000000..7e8a494e05a9ea --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Portuguese Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_bert_base_portuguese_cased +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-portuguese-cased` is a Portuguese model originally trained by `neuralmind`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_pt_5.0.0_3.0_1687371699306.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_pt_5.0.0_3.0_1687371699306.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_portuguese_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file From b1dd662ac64fa606afd715fce8cf0c08bc393372 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:23:03 +0700 Subject: [PATCH 142/149] Add model 2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar --- ...t_embeddings_bert_base_qarib60_1790k_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md new file mode 100644 index 00000000000000..82419381d32b36 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, 1790k Iterations) +author: John Snow Labs +name: bert_embeddings_bert_base_qarib60_1790k +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-qarib60_1790k` is an Arabic model originally trained by `qarib`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_1790k_ar_5.0.0_3.0_1687371740065.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_1790k_ar_5.0.0_3.0_1687371740065.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_1790k").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_1790k").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_qarib60_1790k| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.9 MB| +|Case sensitive:|true| \ No newline at end of file From 12b020ac0d720b5220c86d9fe896e09c462d077f Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:27:15 +0700 Subject: [PATCH 143/149] Add model 2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en --- ...t_embeddings_bert_base_uncased_dstc9_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md new file mode 100644 index 00000000000000..082e97e5cb0c38 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from wilsontam) +author: John Snow Labs +name: bert_embeddings_bert_base_uncased_dstc9 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-uncased-dstc9` is an English model originally trained by `wilsontam`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_dstc9_en_5.0.0_3.0_1687372017097.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_dstc9_en_5.0.0_3.0_1687372017097.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_dstc9").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_dstc9").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_uncased_dstc9| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file From c519aad5bd8d9f7d3a708aab4c14140fccd0835f Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:33:51 +0700 Subject: [PATCH 144/149] Add model 2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en --- ...sparse_70_unstructured_no_classifier_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md new file mode 100644 index 00000000000000..ec8779ce8600e9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Base, Uncased, Unstructured, Without Classifier Layer) +author: John Snow Labs +name: bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. 
`bert-base-uncased-mnli-sparse-70-unstructured-no-classifier` is an English model originally trained by `Intel`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en_5.0.0_3.0_1687372422470.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en_5.0.0_3.0_1687372422470.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_mnli_sparse_70_unstructured_no_classifier").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_mnli_sparse_70_unstructured_no_classifier").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|225.9 MB| +|Case sensitive:|true| \ No newline at end of file From 37bac21dc011bfac105d71872e09aaee4208ffd5 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:35:33 +0700 Subject: [PATCH 145/149] Add model 2023-06-21-bert_embeddings_bert_base_qarib_ar --- ...6-21-bert_embeddings_bert_base_qarib_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md new file mode 100644 index 00000000000000..1af0625cf15067 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_bert_base_qarib +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-qarib` is an Arabic model originally trained by `qarib`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib_ar_5.0.0_3.0_1687372513972.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib_ar_5.0.0_3.0_1687372513972.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_qarib| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.0 MB| +|Case sensitive:|true| \ No newline at end of file From 07240b1f4321af75006228c29f57e7ea54917bbb Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:37:10 +0700 Subject: [PATCH 146/149] Add model 2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en --- ..._base_uncased_sparse_70_unstructured_en.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md new file mode 100644 index 00000000000000..0d1e90b869920a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Base, Uncased, Unstructured) +author: John Snow Labs +name: bert_embeddings_bert_base_uncased_sparse_70_unstructured +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-uncased-sparse-70-unstructured` is an English model originally trained by `Intel`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_sparse_70_unstructured_en_5.0.0_3.0_1687372619550.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_sparse_70_unstructured_en_5.0.0_3.0_1687372619550.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_sparse_70_unstructured").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_sparse_70_unstructured").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_uncased_sparse_70_unstructured| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|225.8 MB| +|Case sensitive:|true| \ No newline at end of file From fcff5e709786c17851cade976f39f54fada0a823 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:38:10 +0700 Subject: [PATCH 147/149] Add model 2023-06-21-ms_bluebert_base_uncased_en --- .../2023-06-21-ms_bluebert_base_uncased_en.md | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md new file mode 100644 index 00000000000000..bf7539ce60805d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md @@ -0,0 +1,106 @@ +--- +layout: model +title: MS-BERT base model (uncased) +author: John Snow Labs +name: ms_bluebert_base_uncased +date: 2023-06-21 +tags: [embeddings, bert, open_source, en, clinical, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is trained by taking BlueBert as the base model, and training on a dataset containing approximately 75,000 clinical notes, for about 5000 patients, totaling over 35.7 million words. These notes were collected from patients who visited St. Michael's Hospital MS Clinic between 2015 and 2019. The notes contained a variety of information pertaining to a neurological exam. For example, a note can contain information on the patient's condition, their progress over time and diagnosis. 
+ +BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it was pretrained with two objectives: + +Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then runs the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence. +Next sentence prediction (NSP): the models concatenate two masked sentences as inputs during pretraining. Sometimes they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to predict if the two sentences were following each other or not. This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard classifier using the features produced by the BERT model as inputs. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ms_bluebert_base_uncased_en_5.0.0_3.0_1687372625112.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/ms_bluebert_base_uncased_en_5.0.0_3.0_1687372625112.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") + .setInputCols("sentence", "token") + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` +
+ +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") + .setInputCols("sentence", "token") + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` +
+ +## Results + +```bash +Results + + +Generates 768 dimensional embeddings per token + + +{:.model-param} +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ms_bluebert_base_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|408.2 MB| +|Case sensitive:|false| + +## References + +https://huggingface.co/NLP4H/ms_bert \ No newline at end of file From 46b21ee563238d08a006d673dace46e5a4651d35 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 01:44:37 +0700 Subject: [PATCH 148/149] Add model 2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar --- ...rt_embeddings_bert_base_qarib60_860k_ar.md | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md new file mode 100644 index 00000000000000..b4cba476c77437 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, 860k Iterations) +author: John Snow Labs +name: bert_embeddings_bert_base_qarib60_860k +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-qarib60_860k` is an Arabic model originally trained by `qarib`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_860k_ar_5.0.0_3.0_1687373057769.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_860k_ar_5.0.0_3.0_1687373057769.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_860k").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_860k").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_qarib60_860k| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.9 MB| +|Case sensitive:|true| \ No newline at end of file From 7f09b85315f49deb774989ff93a7a0440d3cd8ec Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Thu, 22 Jun 2023 00:27:47 +0500 Subject: [PATCH 149/149] fixing wrong spark version and removing tensorflow --- .../2023-06-21-bert_base_uncased_contracts_en.md | 6 +++--- .../2023-06-21-bert_embeddings_InCaseLawBERT_en.md | 6 +++--- .../2023-06-21-bert_embeddings_InLegalBERT_en.md | 6 +++--- ...23-06-21-bert_embeddings_base_uncased_issues_128_en.md | 8 ++++---- ...mbeddings_bioclinicalbert_finetuned_covid_papers_en.md | 6 +++--- ...06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md | 8 ++++---- ...beddings_chemical_uncased_finetuned_cust_c1_cust_en.md | 8 ++++---- ...rt_embeddings_chemical_uncased_finetuned_cust_c2_en.md | 8 ++++---- .../2023-06-21-bert_embeddings_distil_clinical_en.md | 8 ++++---- .../2023-06-21-bert_embeddings_finest_bert_en.md | 6 +++--- .../2023-06-21-bert_embeddings_gbert_base_de.md | 6 +++--- ...bert_embeddings_german_financial_statements_bert_de.md | 6 +++--- .../2023-06-21-bert_embeddings_hateBERT_en.md | 6 +++--- ...06-21-bert_embeddings_indic_transformers_hi_bert_hi.md | 6 +++--- .../2023-06-21-bert_embeddings_javanese_bert_small_jv.md | 6 +++--- .../2023-06-21-bert_embeddings_jobbert_base_cased_en.md | 6 +++--- .../2023-06-21-bert_embeddings_legalbert_adept_en.md | 6 +++--- .../2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md | 8 ++++---- ...-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md | 8 ++++---- .../2023-06-21-bert_embeddings_pretrain_ko.md | 8 ++++---- ...ert_embeddings_scibert_scivocab_finetuned_cord19_en.md | 6 +++--- .../2023-06-21-bert_embeddings_sec_bert_base_en.md | 6 
+++--- .../2023-06-21-bert_embeddings_sec_bert_sh_en.md | 6 +++--- .../2023-06-21-bert_sentence_embeddings_financial_de.md | 6 +++--- .../2023-06-21-biobert_pubmed_base_cased_v1.2_en.md | 6 +++--- ...mbeddings_electra_base_gc4_64k_0_cased_generator_de.md | 6 +++--- ...ngs_electra_base_gc4_64k_1000000_cased_generator_de.md | 6 +++--- ...ings_electra_base_gc4_64k_100000_cased_generator_de.md | 6 +++--- ...ings_electra_base_gc4_64k_200000_cased_generator_de.md | 6 +++--- ...ings_electra_base_gc4_64k_300000_cased_generator_de.md | 6 +++--- ...ings_electra_base_gc4_64k_400000_cased_generator_de.md | 6 +++--- ...ings_electra_base_gc4_64k_500000_cased_generator_de.md | 6 +++--- ...ings_electra_base_gc4_64k_600000_cased_generator_de.md | 6 +++--- ...ings_electra_base_gc4_64k_700000_cased_generator_de.md | 6 +++--- ...ings_electra_base_gc4_64k_800000_cased_generator_de.md | 6 +++--- ...ings_electra_base_gc4_64k_900000_cased_generator_de.md | 6 +++--- ...-06-21-electra_embeddings_electra_base_generator_en.md | 6 +++--- ...eddings_electra_base_italian_xxl_cased_generator_it.md | 6 +++--- ...eddings_electra_base_turkish_mc4_cased_generator_tr.md | 6 +++--- ...dings_electra_base_turkish_mc4_uncased_generator_tr.md | 6 +++--- ...06-21-electra_embeddings_electra_large_generator_en.md | 6 +++--- ...06-21-electra_embeddings_electra_small_generator_en.md | 6 +++--- ...ctra_embeddings_electra_small_japanese_generator_ja.md | 6 +++--- ..._embeddings_electra_tagalog_base_cased_generator_tl.md | 6 +++--- ...mbeddings_electra_tagalog_base_uncased_generator_tl.md | 6 +++--- ...embeddings_electra_tagalog_small_cased_generator_tl.md | 4 ++-- ...beddings_electra_tagalog_small_uncased_generator_tl.md | 6 +++--- ...1-electra_embeddings_electricidad_base_generator_es.md | 6 +++--- ...ctra_embeddings_finance_koelectra_base_generator_ko.md | 6 +++--- ...tra_embeddings_finance_koelectra_small_generator_ko.md | 6 +++--- ...06-21-electra_embeddings_gelectra_base_generator_de.md | 6 +++--- 
...6-21-electra_embeddings_gelectra_large_generator_de.md | 6 +++--- ...6-21-electra_embeddings_koelectra_base_generator_ko.md | 6 +++--- ...1-electra_embeddings_koelectra_base_v2_generator_ko.md | 6 +++--- ...1-electra_embeddings_koelectra_base_v3_generator_ko.md | 6 +++--- ...-21-electra_embeddings_koelectra_small_generator_ko.md | 6 +++--- ...23-06-21-electra_embeddings_kr_electra_generator_ko.md | 6 +++--- docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md | 6 +++--- .../_posts/ahmedlone127/2023-06-21-legalectra_small_es.md | 6 +++--- 59 files changed, 184 insertions(+), 184 deletions(-) diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md index 7940efd351cf2b..2b1a76c0a31802 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md @@ -8,7 +8,7 @@ tags: [open_source, bert, embeddings, finance, contracts, en, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Word Embeddings model, trained on legal contracts, adapted from Huggi {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.4_1687337099443.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.4_1687337099443.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.0_1687337099443.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.0_1687337099443.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md index bb601d4aa558d8..df9c431d1607c0 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md @@ -8,7 +8,7 @@ tags: [bert, en, embeddings, open_source, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provi {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.4_1687336500304.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.4_1687336500304.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.0_1687336500304.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.0_1687336500304.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md index c485e4c203f876..2a5c249b1099dc 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md +++ 
b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md @@ -8,7 +8,7 @@ tags: [bert, en, embeddings, open_source, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provi {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.4_1687336959265.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.4_1687336959265.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.0_1687336959265.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.0_1687336959265.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md index 7a81f1682737e4..5679c83687a692 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md @@ -4,11 +4,11 @@ title: English Bert Embeddings Cased model (from antoinev17) author: John Snow Labs name: bert_embeddings_base_uncased_issues_128 date: 2023-06-21 -tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 
supported: true engine: onnx annotator: BertEmbeddings @@ -29,8 +29,8 @@ Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provid {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.4_1687336183958.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.4_1687336183958.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.0_1687336183958.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.0_1687336183958.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md index ae97a5cf8eece2..259fb891ba3560 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md @@ -8,7 +8,7 @@ tags: [en, open_source, bert, embeddings, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provi {:.btn-box} 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.4_1687337369326.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.4_1687337369326.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.0_1687337369326.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.0_1687337369326.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md index e76ff9449ab0d3..2f13e1e7ba8bd4 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md @@ -4,11 +4,11 @@ title: English Bert Embeddings Cased model (from aditeyabaral) author: John Snow Labs name: bert_embeddings_carlbert_webex_mlm_spatial date: 2023-06-21 -tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provid {:.btn-box} 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.4_1687334153231.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.4_1687334153231.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.0_1687334153231.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.0_1687334153231.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md index e4feb2c5bd8623..1d3affe542dc5c 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md @@ -4,11 +4,11 @@ title: English Bert Embeddings Cased model (from Shafin) author: John Snow Labs name: bert_embeddings_chemical_uncased_finetuned_cust_c1_cust date: 2023-06-21 -tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provid {:.btn-box} 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.4_1687335830911.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.4_1687335830911.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.0_1687335830911.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.0_1687335830911.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md index 1243bba960dfaf..02fdd7d0894952 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md @@ -4,11 +4,11 @@ title: English Bert Embeddings Cased model (from Shafin) author: John Snow Labs name: bert_embeddings_chemical_uncased_finetuned_cust_c2 date: 2023-06-21 -tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provid {:.btn-box} 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.4_1687335658105.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.4_1687335658105.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.0_1687335658105.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.0_1687335658105.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md index 33f643e2799896..e29412f032a148 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md @@ -4,11 +4,11 @@ title: English Bert Embeddings Cased model (from nlpie) author: John Snow Labs name: bert_embeddings_distil_clinical date: 2023-06-21 -tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provid {:.btn-box} 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.4_1687334036385.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.4_1687334036385.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.0_1687334036385.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.0_1687334036385.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md index 4ab11423e6fe62..e01933d053546d 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, fi, et, en, xx, multilingual, open_source, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.4_1687339089124.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.4_1687339089124.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.0_1687339089124.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.0_1687339089124.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md index e142d8aa5e7623..f7c53a48bd8905 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, de, open_source, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.4_1687339723694.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.4_1687339723694.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.0_1687339723694.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.0_1687339723694.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md 
b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md index 6a53f96a8fc12e..bd29b725b81ae5 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, de, open_source, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Financial Bert Word Embeddings model, trained on German Financial Sta {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.4_1687339007310.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.4_1687339007310.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.0_1687339007310.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.0_1687339007310.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md index 5d95f85bb63367..82f3c85d2247d3 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, en, open_source, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 
3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.4_1687340123478.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.4_1687340123478.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.0_1687340123478.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.0_1687340123478.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md index 4f0ec15e3eda26..cb1d065f6341ec 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, hi, open_source, onnx] task: Embeddings language: hi edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.4_1687339963111.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.4_1687339963111.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.0_1687339963111.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.0_1687339963111.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md index 424b3c45fbd045..92986039f002be 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, jv, open_source, onnx] task: Embeddings language: jv edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.4_1687339377809.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.4_1687339377809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.0_1687339377809.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.0_1687339377809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md index db07e2246747a5..ac620f546f44ce 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md @@ -8,7 +8,7 @@ tags: [bert, en, embeddings, open_source, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provi {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.4_1687336524220.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.4_1687336524220.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.0_1687336524220.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.0_1687336524220.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md index c6fb37952b5c59..5dc84ccc68d550 100644 --- 
a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md @@ -8,7 +8,7 @@ tags: [bert, en, english, embeddings, transformer, open_source, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provi {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.4_1687335917569.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.4_1687335917569.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.0_1687335917569.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.0_1687335917569.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md index dc8d5a0bba5e06..022feacea52d3c 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md @@ -4,11 +4,11 @@ title: English Legal BERT Embeddings author: John Snow Labs name: bert_embeddings_lsg16k_Italian_Legal date: 2023-06-21 -tags: [longformer, it, italian, embeddings, transformer, open_source, tensorflow, onnx] +tags: [longformer, it, italian, embeddings, transformer, open_source, onnx] 
task: Embeddings language: it edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provi {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.4_1687335744395.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.4_1687335744395.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.0_1687335744395.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.0_1687335744395.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md index 371264a43afc17..3566124863b301 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md @@ -4,11 +4,11 @@ title: English Bert Embeddings Cased model (from Tristan) author: John Snow Labs name: bert_embeddings_olm_base_uncased_oct_2022 date: 2023-06-21 -tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, tensorflow, onnx] +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained 
BertEmbeddings model, adapted from Hugging Face and curated to provid {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.4_1687336305222.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.4_1687336305222.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.0_1687336305222.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.0_1687336305222.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md index 530f8a4947793c..0308fcbcd07582 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md @@ -4,11 +4,11 @@ title: Korean Bert Embeddings Cased model (from onlydj96) author: John Snow Labs name: bert_embeddings_pretrain date: 2023-06-21 -tags: [open_source, bert, bert_embeddings, bertformaskedlm, ko, tensorflow, onnx] +tags: [open_source, bert, bert_embeddings, bertformaskedlm, ko, onnx] task: Embeddings language: ko edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provid {:.btn-box} 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.4_1687336252702.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.4_1687336252702.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.0_1687336252702.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.0_1687336252702.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md index 5376ed57e932ae..1e641bcf5d79b3 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md @@ -8,7 +8,7 @@ tags: [en, open_source, bert, embeddings, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provi {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.4_1687336817133.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.4_1687336817133.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.0_1687336817133.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.0_1687336817133.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md index 2be531ad5a4f8d..8e1158bbb801c8 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, en, open_source, financial, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -33,8 +33,8 @@ If you are interested in Financial Embeddings, take a look also at these two mod {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.4_1687339042219.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.4_1687339042219.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.0_1687339042219.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.0_1687339042219.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git 
a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md index 67cc5c1d89bcf3..0958df101c9131 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, en, open_source, financial, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -33,8 +33,8 @@ If you are interested in Financial Embeddings, take a look also at these two mod {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.4_1687339128341.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.4_1687339128341.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.0_1687339128341.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.0_1687339128341.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md index f7a957e5d314da..13d2898c521b50 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, de, open_source, financial, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 
+spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -30,8 +30,8 @@ Financial Pretrained BERT Embeddings model, uploaded to Hugging Face, adapted an {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.4_1687338810949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.4_1687338810949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.0_1687338810949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.0_1687338810949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md index c7d99cb4660296..794836f2f29942 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md @@ -8,7 +8,7 @@ tags: [bert, embeddings, en, open_source, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ This model is the v1.2 of [biobert_pubmed_base_cased](https://nlp.johnsnowlabs.c {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.4_1687336480762.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.4_1687336480762.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.0_1687336480762.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.0_1687336480762.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md index e342e0a50c0654..eb920c7ffb15b9 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.4_1687338403600.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.4_1687338403600.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.0_1687338403600.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.0_1687338403600.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md index 010216d521b613..d6613cc7e8cf70 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.4_1687337566476.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.4_1687337566476.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.0_1687337566476.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.0_1687337566476.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md index c94835e08e9c8a..b1110b5c50cebd 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.4_1687337430315.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.4_1687337430315.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.0_1687337430315.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.0_1687337430315.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git 
a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md index 2c1aec5f485cb5..e90872b5cf0574 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.4_1687337323809.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.4_1687337323809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.0_1687337323809.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.0_1687337323809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md index 1d5f7c779e4a9b..e9a244fe395167 
100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.4_1687337742127.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.4_1687337742127.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.0_1687337742127.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.0_1687337742127.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md index e7d78cc73de8e1..cf463da7d3ede6 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: 
[de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.4_1687338531671.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.4_1687338531671.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.0_1687338531671.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.0_1687338531671.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md index 34fc199ce62012..0e0368cae00d7c 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, 
adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.4_1687337310787.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.4_1687337310787.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.0_1687337310787.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.0_1687337310787.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md index f29eff6d36ec34..3f63d4ca68b519 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.4_1687338289447.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.4_1687338289447.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.0_1687338289447.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.0_1687338289447.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md index 74536373269704..6ce95cc170b433 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.4_1687336559193.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.4_1687336559193.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.0_1687336559193.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.0_1687336559193.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md index 2ee16b3cc0b382..b3e7e29f1c1fe9 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.4_1687336668760.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.4_1687336668760.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.0_1687336668760.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.0_1687336668760.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md index 08ffc8924552f0..9e2a0a0531c231 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.4_1687336789214.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.4_1687336789214.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.0_1687336789214.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.0_1687336789214.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md index 1134d2ff11d8f7..e83a14d6f6ba62 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md @@ -8,7 +8,7 @@ tags: [en, open_source, electra, embeddings, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.4_1687337315482.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.4_1687337315482.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.0_1687337315482.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.0_1687337315482.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md 
b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md index e7d940835a2164..b650a25e829416 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md @@ -8,7 +8,7 @@ tags: [it, open_source, electra, embeddings, onnx] task: Embeddings language: it edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.4_1687337384147.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.4_1687337384147.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.0_1687337384147.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.0_1687337384147.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md index 604b5b0ec2c299..e41b06ae521f40 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md +++ 
b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md @@ -8,7 +8,7 @@ tags: [tr, open_source, electra, embeddings, onnx] task: Embeddings language: tr edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.4_1687337596423.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.4_1687337596423.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.0_1687337596423.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.0_1687337596423.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md index 27834e8f8f4ba8..41859cc555c6a2 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md @@ -8,7 +8,7 @@ tags: [tr, open_source, electra, embeddings, onnx] task: Embeddings language: tr edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 
supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.4_1687337246703.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.4_1687337246703.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.0_1687337246703.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.0_1687337246703.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md index ea273b2caeb31c..bbfb7f281e7a49 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md @@ -8,7 +8,7 @@ tags: [en, open_source, electra, embeddings, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.4_1687337805375.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.4_1687337805375.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.0_1687337805375.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.0_1687337805375.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md index 65889ff972397c..dabe96a7d7b5a2 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md @@ -8,7 +8,7 @@ tags: [en, open_source, electra, embeddings, onnx] task: Embeddings language: en edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.4_1687337729115.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.4_1687337729115.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.0_1687337729115.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.0_1687337729115.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md index 6944eeb7698db4..29d87ba99e8341 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md @@ -8,7 +8,7 @@ tags: [ja, open_source, electra, embeddings, onnx] task: Embeddings language: ja edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.4_1687338737717.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.4_1687338737717.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.0_1687338737717.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.0_1687338737717.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md index 0a11f3ced2e5f2..84aeba9befe4a7 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md @@ -8,7 +8,7 @@ tags: [tl, open_source, electra, embeddings, onnx] task: Embeddings language: tl edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.4_1687338660491.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.4_1687338660491.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.0_1687338660491.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.0_1687338660491.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md index cb0c52829db71c..e48ebc2ee91dfd 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md @@ -8,7 +8,7 @@ tags: [tl, open_source, electra, embeddings, onnx] task: Embeddings language: tl edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.4_1687338703736.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.4_1687338703736.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.0_1687338703736.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.0_1687338703736.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git 
a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md index b132ce16178180..df146d8836dce7 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.4_1687338628903.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.4_1687338628903.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.0_1687338628903.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.0_1687338628903.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md index 8962b0c8151a21..58cea57ca00683 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md @@ -8,7 +8,7 @@ tags: [tl, open_source, electra, 
embeddings, onnx] task: Embeddings language: tl edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.4_1687338586547.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.4_1687338586547.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.0_1687338586547.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.0_1687338586547.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md index f8d5fb802d26dc..be35b303a82993 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md @@ -8,7 +8,7 @@ tags: [es, open_source, electra, embeddings, onnx] task: Embeddings language: es edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} 
-[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.4_1687337686007.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.4_1687337686007.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.0_1687337686007.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.0_1687337686007.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md index 49fffd2af1f284..af1b52acd3ac9e 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md @@ -8,7 +8,7 @@ tags: [ko, open_source, electra, embeddings, onnx] task: Embeddings language: ko edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Financial Korean Electra Embeddings model, adapted from Hugging Face {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.4_1687337679070.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.4_1687337679070.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.0_1687337679070.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.0_1687337679070.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md index 5955e86f590f3a..76acc791540c7c 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md @@ -8,7 +8,7 @@ tags: [ko, open_source, electra, embeddings, onnx] task: Embeddings language: ko edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Financial Korean Electra Embeddings model, adapted from Hugging Face {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.4_1687338677896.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.4_1687338677896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.0_1687338677896.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.0_1687338677896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md index cd680ca947182c..35d0fddb181a54 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.4_1687338626775.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.4_1687338626775.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.0_1687338626775.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.0_1687338626775.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md index 719a9cd57ed8a0..6d2e16d4eeefb6 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md @@ -8,7 +8,7 @@ tags: [de, open_source, electra, embeddings, onnx] task: Embeddings language: de edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.4_1687338033613.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.4_1687338033613.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.0_1687338033613.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.0_1687338033613.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md index 
42848be76dc939..53e791d787729c 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md @@ -8,7 +8,7 @@ tags: [ko, open_source, electra, embeddings, onnx] task: Embeddings language: ko edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.4_1687337873576.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.4_1687337873576.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.0_1687337873576.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.0_1687337873576.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md index f53d78f8396e90..94addd53290ea6 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md @@ -8,7 +8,7 @@ tags: [ko, open_source, electra, embeddings, onnx] task: Embeddings language: ko edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: 
true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.4_1687337792559.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.4_1687337792559.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.0_1687337792559.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.0_1687337792559.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md index 0cf5183f230459..d08cb9f7252863 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md @@ -8,7 +8,7 @@ tags: [ko, open_source, electra, embeddings, onnx] task: Embeddings language: ko edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.4_1687337798528.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 
URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.4_1687337798528.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.0_1687337798528.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.0_1687337798528.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md index f5881a5cbb4be8..e323b62a1bbec4 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md @@ -8,7 +8,7 @@ tags: [ko, open_source, electra, embeddings, onnx] task: Embeddings language: ko edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.4_1687338723919.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.4_1687338723919.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} 
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.0_1687338723919.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.0_1687338723919.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md index 146c04252c2129..0aeb0d295a40c1 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md @@ -8,7 +8,7 @@ tags: [ko, open_source, electra, embeddings, onnx] task: Embeddings language: ko edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Electra Embeddings model, adapted from Hugging Face and curated to pr {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.4_1687338860027.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.4_1687338860027.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.0_1687338860027.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.0_1687338860027.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff 
--git a/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md index c18507c00caedd..ed972bedaa7edf 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md +++ b/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md @@ -8,7 +8,7 @@ tags: [open_source, legalectra, embeddings, electra, legal, es, onnx] task: Embeddings language: es edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Spanish Legal Word Embeddings model, adapted from Hugging Face and cu {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.4_1687336669896.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.4_1687336669896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.0_1687336669896.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.0_1687336669896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use diff --git a/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md b/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md index 8f217918038423..a2fb304397a6a0 100644 --- a/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md +++ b/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md @@ -8,7 +8,7 @@ tags: [open_source, legalectra, embeddings, electra, legal, small, es, onnx] task: Embeddings language: es edition: Spark NLP 5.0.0 -spark_version: 3.4 +spark_version: 3.0 supported: true engine: onnx annotator: BertEmbeddings @@ -28,8 +28,8 @@ Pretrained Spanish Legal Word Embeddings model, 
adapted from Hugging Face and cu {:.btn-box} -[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.4_1687336489949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} -[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.4_1687336489949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.0_1687336489949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.0_1687336489949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} ## How to use