diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md new file mode 100644 index 00000000000000..2b1a76c0a31802 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_base_uncased_contracts_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Legal Contracts BertEmbeddings model (Base, Uncased) +author: John Snow Labs +name: bert_base_uncased_contracts +date: 2023-06-21 +tags: [open_source, bert, embeddings, finance, contracts, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Word Embeddings model, trained on legal contracts, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert-base-uncased-contracts` is an English model originally trained by `nlpaueb`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.0_1687337099443.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_uncased_contracts_en_5.0.0_3.0_1687337099443.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP.").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.contracts.uncased_base").predict("""I love Spark NLP.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_base_uncased_contracts","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP.").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.contracts.uncased_base").predict("""I love Spark NLP.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_base_uncased_contracts| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md new file mode 100644 index 00000000000000..a8bde5a9373131 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_ARBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (ARBERT model) +author: John Snow Labs +name: bert_embeddings_ARBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `ARBERT` is an Arabic model originally trained by `UBC-NLP`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_ARBERT_ar_5.0.0_3.0_1687368387135.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_ARBERT_ar_5.0.0_3.0_1687368387135.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arbert").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_ARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arbert").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_ARBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|605.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md new file mode 100644 index 00000000000000..507d80ddf48d81 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_AraBertMo_base_V1_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from Ebtihal) +author: John Snow Labs +name: bert_embeddings_AraBertMo_base_V1 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `AraBertMo_base_V1` is an Arabic model originally trained by `Ebtihal`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_AraBertMo_base_V1_ar_5.0.0_3.0_1687367402700.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_AraBertMo_base_V1_ar_5.0.0_3.0_1687367402700.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.AraBertMo_base_V1").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_AraBertMo_base_V1","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.AraBertMo_base_V1").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_AraBertMo_base_V1| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|407.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md new file mode 100644 index 00000000000000..aa25c8bbbd15f0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_Ara_DialectBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from MutazYoune) +author: John Snow Labs +name: bert_embeddings_Ara_DialectBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `Ara_DialectBERT` is an Arabic model originally trained by `MutazYoune`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_Ara_DialectBERT_ar_5.0.0_3.0_1687367717615.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_Ara_DialectBERT_ar_5.0.0_3.0_1687367717615.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.Ara_DialectBERT").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_Ara_DialectBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.Ara_DialectBERT").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_Ara_DialectBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md new file mode 100644 index 00000000000000..1156a182032942 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_COVID_SciBERT_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from lordtt13) +author: John Snow Labs +name: bert_embeddings_COVID_SciBERT +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `COVID-SciBERT` is an English model originally trained by `lordtt13`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_COVID_SciBERT_en_5.0.0_3.0_1687368450114.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_COVID_SciBERT_en_5.0.0_3.0_1687368450114.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.COVID_SciBERT").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_COVID_SciBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.COVID_SciBERT").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_COVID_SciBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|412.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md new file mode 100644 index 00000000000000..019db2765f56c4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_DarijaBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from Kamel) +author: John Snow Labs +name: bert_embeddings_DarijaBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `DarijaBERT` is an Arabic model originally trained by `Kamel`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_DarijaBERT_ar_5.0.0_3.0_1687367582690.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_DarijaBERT_ar_5.0.0_3.0_1687367582690.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.DarijaBERT").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_DarijaBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.DarijaBERT").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_DarijaBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|551.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md new file mode 100644 index 00000000000000..f209c346ba5ecc --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_FinancialBERT_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Financial Bert Embeddings +author: John Snow Labs +name: bert_embeddings_FinancialBERT +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `FinancialBERT` is an English Financial model originally trained on a very large corpus of financial texts including Earnings Calls, Corporate reports, Bloomberg News, TRC2-financial. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_FinancialBERT_en_5.0.0_3.0_1687368067375.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_FinancialBERT_en_5.0.0_3.0_1687368067375.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.FinancialBERT").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_FinancialBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.FinancialBERT").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_FinancialBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md new file mode 100644 index 00000000000000..6f9f54bac44943 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_German_MedBERT_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Medical Bert Embeddings +author: John Snow Labs +name: bert_embeddings_German_MedBERT +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained German Medical Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `German-MedBERT` is a German model originally trained by `smanjil`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_German_MedBERT_de_5.0.0_3.0_1687367757622.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_German_MedBERT_de_5.0.0_3.0_1687367757622.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.medbert").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_German_MedBERT","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.medbert").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_German_MedBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md new file mode 100644 index 00000000000000..df9c431d1607c0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InCaseLawBERT_en.md @@ -0,0 +1,135 @@ +--- +layout: model +title: English BERT Embeddings (from law-ai) +author: John Snow Labs +name: bert_embeddings_InCaseLawBERT +date: 2023-06-21 +tags: [bert, en, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `InCaseLawBERT` is an English model originally trained by `law-ai`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.0_1687336500304.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InCaseLawBERT_en_5.0.0_3.0_1687336500304.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InCaseLawBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_InCaseLawBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md new file mode 100644 index 00000000000000..2a5c249b1099dc --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_InLegalBERT_en.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Legal English BERT Embeddings (from law-ai) +author: John Snow Labs +name: bert_embeddings_InLegalBERT +date: 2023-06-21 +tags: [bert, en, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `InLegalBERT` is an English model originally trained by `law-ai`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.0_1687336959265.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_InLegalBERT_en_5.0.0_3.0_1687336959265.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_InLegalBERT","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_InLegalBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md new file mode 100644 index 00000000000000..df8e6020ce97b2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERT_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (MARBERT model) +author: John Snow Labs +name: bert_embeddings_MARBERT +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `MARBERT` is an Arabic model originally trained by `UBC-NLP`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERT_ar_5.0.0_3.0_1687367317123.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERT_ar_5.0.0_3.0_1687367317123.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERT").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERT","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERT").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_MARBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|608.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md new file mode 100644 index 00000000000000..26d222b5c236ee --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_MARBERTv2_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (MARBERT model v2) +author: John Snow Labs +name: bert_embeddings_MARBERTv2 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `MARBERTv2` is an Arabic model originally trained by `UBC-NLP`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERTv2_ar_5.0.0_3.0_1687354749271.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_MARBERTv2_ar_5.0.0_3.0_1687354749271.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERTv2").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_MARBERTv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.MARBERTv2").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_MARBERTv2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|606.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md new file mode 100644 index 00000000000000..0f8dd6b3d732db --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_agriculture_bert_uncased_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Base, Uncased, Agriculture) +author: John Snow Labs +name: bert_embeddings_agriculture_bert_uncased +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `agriculture-bert-uncased` is an English model originally trained by `recobo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_agriculture_bert_uncased_en_5.0.0_3.0_1687368891491.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_agriculture_bert_uncased_en_5.0.0_3.0_1687368891491.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.agriculture_bert_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_agriculture_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.agriculture_bert_uncased").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_agriculture_bert_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md new file mode 100644 index 00000000000000..334f6947d0078e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_alberti_bert_base_multilingual_cased_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (from flax-community) +author: John Snow Labs +name: bert_embeddings_alberti_bert_base_multilingual_cased +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `alberti-bert-base-multilingual-cased` is a Spanish model originally trained by `flax-community`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_alberti_bert_base_multilingual_cased_es_5.0.0_3.0_1687368551885.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_alberti_bert_base_multilingual_cased_es_5.0.0_3.0_1687368551885.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.alberti_bert_base_multilingual_cased").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_alberti_bert_base_multilingual_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.alberti_bert_base_multilingual_cased").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_alberti_bert_base_multilingual_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|664.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md new file mode 100644 index 00000000000000..d94b26d12cfe68 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_arabert_c19_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Arabert model, Covid-19) +author: John Snow Labs +name: bert_embeddings_arabert_c19 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `arabert_c19` is an Arabic model originally trained by `moha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_arabert_c19_ar_5.0.0_3.0_1687369343067.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_arabert_c19_ar_5.0.0_3.0_1687369343067.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arabert_c19").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_arabert_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.arabert_c19").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_arabert_c19| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md new file mode 100644 index 00000000000000..05609a11fcf313 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_base_bn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Bangla Bert Embeddings +author: John Snow Labs +name: bert_embeddings_bangla_bert_base +date: 2023-06-21 +tags: [bert, embeddings, bn, open_source, onnx] +task: Embeddings +language: bn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bangla-bert-base` is a Bangla model originally trained by `sagorsarker`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_base_bn_5.0.0_3.0_1687370097955.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_base_bn_5.0.0_3.0_1687370097955.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangala_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert_base","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangala_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bangla_bert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|bn| +|Size:|614.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md new file mode 100644 index 00000000000000..27aac09a7bc754 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bangla_bert_bn.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Bangla Bert Embeddings (from Kowsher) +author: John Snow Labs +name: bert_embeddings_bangla_bert +date: 2023-06-21 +tags: [bert, embeddings, bn, open_source, onnx] +task: Embeddings +language: bn +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bangla-bert` is a Bangla model originally trained by `Kowsher`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_bn_5.0.0_3.0_1687369015466.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bangla_bert_bn_5.0.0_3.0_1687369015466.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangla_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["আমি স্পার্ক এনএলপি ভালোবাসি"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bangla_bert","bn") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("আমি স্পার্ক এনএলপি ভালোবাসি").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("bn.embed.bangla_bert").predict("""আমি স্পার্ক এনএলপি ভালোবাসি""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bangla_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|bn| +|Size:|612.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md new file mode 100644 index 00000000000000..5679c83687a692 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_base_uncased_issues_128_en.md @@ -0,0 +1,141 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from antoinev17) +author: John Snow Labs +name: bert_embeddings_base_uncased_issues_128 +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert-base-uncased-issues-128` is an English model originally trained by `antoinev17 +`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.0_1687336183958.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_base_uncased_issues_128_en_5.0.0_3.0_1687336183958.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_base_uncased_issues_128","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_base_uncased_issues_128| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md new file mode 100644 index 00000000000000..99f607f5c1530b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_5lang_cased_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (from amine) +author: John Snow Labs +name: bert_embeddings_bert_base_5lang_cased +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-5lang-cased` is a Spanish model originally trained by `amine`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_5lang_cased_es_5.0.0_3.0_1687370074087.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_5lang_cased_es_5.0.0_3.0_1687370074087.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert_base_5lang_cased").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_5lang_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert_base_5lang_cased").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_5lang_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|461.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md new file mode 100644 index 00000000000000..e8151fef004624 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabert_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model) +author: John Snow Labs +name: bert_embeddings_bert_base_arabert +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabert` is an Arabic model originally trained by `aubmindlab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabert_ar_5.0.0_3.0_1687370767272.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabert_ar_5.0.0_3.0_1687370767272.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabert").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabert").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md new file mode 100644 index 00000000000000..b0ffc23c7e5de1 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv01_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model, v01) +author: John Snow Labs +name: bert_embeddings_bert_base_arabertv01 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv01` is an Arabic model originally trained by `aubmindlab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv01_ar_5.0.0_3.0_1687370107542.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv01_ar_5.0.0_3.0_1687370107542.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv01").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv01","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv01").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabertv01| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|505.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md new file mode 100644 index 00000000000000..00a46a50c9de69 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model, v02) +author: John Snow Labs +name: bert_embeddings_bert_base_arabertv02 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv02` is an Arabic model originally trained by `aubmindlab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_ar_5.0.0_3.0_1687369054270.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_ar_5.0.0_3.0_1687369054270.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabertv02| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|505.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md new file mode 100644 index 00000000000000..527a150609fdb3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv02_twitter_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model, v02, Twitter) +author: John Snow Labs +name: bert_embeddings_bert_base_arabertv02_twitter +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv02-twitter` is an Arabic model originally trained by `aubmindlab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_twitter_ar_5.0.0_3.0_1687367879067.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv02_twitter_ar_5.0.0_3.0_1687367879067.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02_twitter").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv02_twitter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv02_twitter").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabertv02_twitter| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|505.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md new file mode 100644 index 00000000000000..28aa7881be3bad --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabertv2_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Arabert Model, v2) +author: John Snow Labs +name: bert_embeddings_bert_base_arabertv2 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabertv2` is an Arabic model originally trained by `aubmindlab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv2_ar_5.0.0_3.0_1687366696592.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabertv2_ar_5.0.0_3.0_1687366696592.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv2").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabertv2","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabertv2").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabertv2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md new file mode 100644 index 00000000000000..ec1f8a7ed29f13 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic` is an Arabic model originally trained by `asafaya`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_ar_5.0.0_3.0_1687367514433.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_ar_5.0.0_3.0_1687367514433.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|412.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md new file mode 100644 index 00000000000000..2db1ef256735f2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_mix_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, DA-CA-MSA variants) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_mix +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-mix` is an Arabic model originally trained by `CAMeL-Lab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_mix_ar_5.0.0_3.0_1687366836156.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_mix_ar_5.0.0_3.0_1687366836156.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_mix").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_mix","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_mix").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_mix| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md new file mode 100644 index 00000000000000..97bcdf7edd7fe2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa` is an Arabic model originally trained by `CAMeL-Lab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_ar_5.0.0_3.0_1687355261025.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_ar_5.0.0_3.0_1687355261025.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md new file mode 100644 index 00000000000000..6d2176403c972d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Trained on an eighth of the full MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa_eighth +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-eighth` is an Arabic model originally trained by `CAMeL-Lab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar_5.0.0_3.0_1687366398028.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_eighth_ar_5.0.0_3.0_1687366398028.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_eighth").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_eighth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_eighth").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_eighth| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md new file mode 100644 index 00000000000000..1ef43767d97397 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_half_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Trained on a half of the full MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa_half +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-half` is an Arabic model originally trained by `CAMeL-Lab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_half_ar_5.0.0_3.0_1687355081033.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_half_ar_5.0.0_3.0_1687355081033.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_half").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_half","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_half").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_half| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md new file mode 100644 index 00000000000000..622c54ebe635f9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Trained on a quarter of the full MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa_quarter +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-quarter` is an Arabic model originally trained by `CAMeL-Lab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar_5.0.0_3.0_1687366524279.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_quarter_ar_5.0.0_3.0_1687366524279.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_quarter").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_quarter","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_quarter").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_quarter| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md new file mode 100644 index 00000000000000..efc6980941ceab --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, Trained on a sixteenth of the full MSA dataset) +author: John Snow Labs +name: bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-arabic-camelbert-msa-sixteenth` is an Arabic model originally trained by `CAMeL-Lab`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar_5.0.0_3.0_1687366813331.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth_ar_5.0.0_3.0_1687366813331.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_sixteenth").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_arabic_camelbert_msa_sixteenth").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_arabic_camelbert_msa_sixteenth| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|406.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md new file mode 100644 index 00000000000000..ae5920ac1cb795 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_cased_pt_lenerbr_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Portuguese Legal Bert Embeddings (Cased) +author: John Snow Labs +name: bert_embeddings_bert_base_cased_pt_lenerbr +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Legal Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-cased-pt-lenerbr` is a Portuguese model originally trained by `pierreguillou`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_cased_pt_lenerbr_pt_5.0.0_3.0_1687354957150.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_cased_pt_lenerbr_pt_5.0.0_3.0_1687354957150.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_cased_pt_lenerbr").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_cased_pt_lenerbr","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_cased_pt_lenerbr").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_cased_pt_lenerbr| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md new file mode 100644 index 00000000000000..1f985340872ac9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_cased_oldvocab_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Bert Embeddings (Base, Cased, Old Vocabulary) +author: John Snow Labs +name: bert_embeddings_bert_base_german_cased_oldvocab +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-german-cased-oldvocab` is a German model originally trained by `deepset`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_cased_oldvocab_de_5.0.0_3.0_1687355117712.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_cased_oldvocab_de_5.0.0_3.0_1687355117712.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_cased_oldvocab").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_cased_oldvocab","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_cased_oldvocab").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_german_cased_oldvocab| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md new file mode 100644 index 00000000000000..91fa66ad63e5d8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_german_uncased_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Bert Embeddings +author: John Snow Labs +name: bert_embeddings_bert_base_german_uncased +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-german-uncased` is a German model originally trained by `dbmdz`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_uncased_de_5.0.0_3.0_1687366506395.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_german_uncased_de_5.0.0_3.0_1687366506395.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_uncased").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_german_uncased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_german_uncased").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_german_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md new file mode 100644 index 00000000000000..5b67a75c4e936e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_gl_cased_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Portuguese Bert Embeddings (Base, Cased) +author: John Snow Labs +name: bert_embeddings_bert_base_gl_cased +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-gl-cased` is a Portuguese model originally trained by `marcosgg`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_gl_cased_pt_5.0.0_3.0_1687367086939.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_gl_cased_pt_5.0.0_3.0_1687367086939.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_gl_cased").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_gl_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_gl_cased").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_gl_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|664.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md new file mode 100644 index 00000000000000..4626d471479c49 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_historical_german_rw_cased_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Bert Embeddings (from redewiedergabe) +author: John Snow Labs +name: bert_embeddings_bert_base_historical_german_rw_cased +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-historical-german-rw-cased` is a German model originally trained by `redewiedergabe`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_historical_german_rw_cased_de_5.0.0_3.0_1687366604668.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_historical_german_rw_cased_de_5.0.0_3.0_1687366604668.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_historical_german_rw_cased").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_historical_german_rw_cased","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert_base_historical_german_rw_cased").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_historical_german_rw_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md new file mode 100644 index 00000000000000..ab7513a407cb6c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Bert Embeddings (Cased) +author: John Snow Labs +name: bert_embeddings_bert_base_italian_xxl_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-italian-xxl-cased` is an Italian model originally trained by `dbmdz`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_cased_it_5.0.0_3.0_1687367037078.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_cased_it_5.0.0_3.0_1687367037078.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_cased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_italian_xxl_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md new file mode 100644 index 00000000000000..8f1b62389ff59c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_italian_xxl_uncased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Bert Embeddings (Uncased) +author: John Snow Labs +name: bert_embeddings_bert_base_italian_xxl_uncased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-italian-xxl-uncased` is an Italian model originally trained by `dbmdz`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_uncased_it_5.0.0_3.0_1687366606479.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_italian_xxl_uncased_it_5.0.0_3.0_1687366606479.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_uncased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_italian_xxl_uncased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.bert_base_italian_xxl_uncased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_italian_xxl_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md new file mode 100644 index 00000000000000..586c43ffa392b7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_ko.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Korean Bert Embeddings +author: John Snow Labs +name: bert_embeddings_bert_base +date: 2023-06-21 +tags: [bert, embeddings, ko, open_source, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base` is a Korean model originally trained by `klue`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_ko_5.0.0_3.0_1687371079238.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_ko_5.0.0_3.0_1687371079238.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ko| +|Size:|412.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md new file mode 100644 index 00000000000000..1977eade89c812 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Legal Portuguese Embeddings (Base, Petitions) +author: John Snow Labs +name: bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-portuguese-cased-finetuned-peticoes` is a Portuguese model originally trained by `Luciano`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt_5.0.0_3.0_1687371316772.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes_pt_5.0.0_3.0_1687371316772.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_peticoes").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_peticoes").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_portuguese_cased_finetuned_peticoes| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md new file mode 100644 index 00000000000000..51c244361ef7c6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Legal Portuguese Embeddings (Base, Agreements) +author: John Snow Labs +name: bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-portuguese-cased-finetuned-tcu-acordaos` is a Portuguese model originally trained by `Luciano`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt_5.0.0_3.0_1687371364352.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos_pt_5.0.0_3.0_1687371364352.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_tcu_acordaos").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased_finetuned_tcu_acordaos").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_portuguese_cased_finetuned_tcu_acordaos| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md new file mode 100644 index 00000000000000..7e8a494e05a9ea --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_portuguese_cased_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Portuguese Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_bert_base_portuguese_cased +date: 2023-06-21 +tags: [bert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-portuguese-cased` is a Portuguese model originally trained by `neuralmind`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_pt_5.0.0_3.0_1687371699306.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_portuguese_cased_pt_5.0.0_3.0_1687371699306.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased").predict("""Eu amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Eu amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_portuguese_cased","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Eu amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.bert_base_portuguese_cased").predict("""Eu amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_portuguese_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md new file mode 100644 index 00000000000000..82419381d32b36 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_1790k_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, 1790k Iterations) +author: John Snow Labs +name: bert_embeddings_bert_base_qarib60_1790k +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-qarib60_1790k` is an Arabic model originally trained by `qarib`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_1790k_ar_5.0.0_3.0_1687371740065.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_1790k_ar_5.0.0_3.0_1687371740065.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_1790k").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_1790k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_1790k").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_qarib60_1790k| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md new file mode 100644 index 00000000000000..b4cba476c77437 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib60_860k_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base, 860k Iterations) +author: John Snow Labs +name: bert_embeddings_bert_base_qarib60_860k +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-qarib60_860k` is an Arabic model originally trained by `qarib`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_860k_ar_5.0.0_3.0_1687373057769.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib60_860k_ar_5.0.0_3.0_1687373057769.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_860k").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib60_860k","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib60_860k").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_qarib60_860k| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md new file mode 100644 index 00000000000000..1af0625cf15067 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_qarib_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_bert_base_qarib +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-qarib` is an Arabic model originally trained by `qarib`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib_ar_5.0.0_3.0_1687372513972.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_qarib_ar_5.0.0_3.0_1687372513972.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_qarib","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_base_qarib").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_qarib| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|504.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md new file mode 100644 index 00000000000000..082e97e5cb0c38 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_dstc9_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from wilsontam) +author: John Snow Labs +name: bert_embeddings_bert_base_uncased_dstc9 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-uncased-dstc9` is an English model originally trained by `wilsontam`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_dstc9_en_5.0.0_3.0_1687372017097.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_dstc9_en_5.0.0_3.0_1687372017097.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_dstc9").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_dstc9","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_dstc9").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_uncased_dstc9| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md new file mode 100644 index 00000000000000..ec8779ce8600e9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Base, Uncased, Unstructured, Without Classifier Layer) +author: John Snow Labs +name: bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-uncased-mnli-sparse-70-unstructured-no-classifier` is an English model originally trained by `Intel`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en_5.0.0_3.0_1687372422470.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier_en_5.0.0_3.0_1687372422470.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_mnli_sparse_70_unstructured_no_classifier").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_mnli_sparse_70_unstructured_no_classifier").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_uncased_mnli_sparse_70_unstructured_no_classifier| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|225.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md new file mode 100644 index 00000000000000..0d1e90b869920a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_base_uncased_sparse_70_unstructured_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Base, Uncased, Unstructured) +author: John Snow Labs +name: bert_embeddings_bert_base_uncased_sparse_70_unstructured +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-base-uncased-sparse-70-unstructured` is an English model originally trained by `Intel`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_sparse_70_unstructured_en_5.0.0_3.0_1687372619550.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_base_uncased_sparse_70_unstructured_en_5.0.0_3.0_1687372619550.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_sparse_70_unstructured").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_base_uncased_sparse_70_unstructured","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_base_uncased_sparse_70_unstructured").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_base_uncased_sparse_70_unstructured| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|225.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md new file mode 100644 index 00000000000000..cdf41dcad66847 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_kor_base_ko.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Korean Bert Embeddings (from kykim) +author: John Snow Labs +name: bert_embeddings_bert_kor_base +date: 2023-06-21 +tags: [bert, embeddings, ko, open_source, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-kor-base` is a Korean model originally trained by `kykim`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_kor_base_ko_5.0.0_3.0_1687369025243.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_kor_base_ko_5.0.0_3.0_1687369025243.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert_kor_base").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_kor_base","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.bert_kor_base").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_kor_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ko| +|Size:|441.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md new file mode 100644 index 00000000000000..e4ab0e46ec7075 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_medium_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Medium) +author: John Snow Labs +name: bert_embeddings_bert_medium_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-medium-arabic` is an Arabic model originally trained by `asafaya`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_medium_arabic_ar_5.0.0_3.0_1687370471346.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_medium_arabic_ar_5.0.0_3.0_1687370471346.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_medium_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_medium_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_medium_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_medium_arabic| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|157.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md new file mode 100644 index 00000000000000..d4f0e3cce03c7e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_mini_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Mini) +author: John Snow Labs +name: bert_embeddings_bert_mini_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-mini-arabic` is an Arabic model originally trained by `asafaya`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_mini_arabic_ar_5.0.0_3.0_1687370518080.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_mini_arabic_ar_5.0.0_3.0_1687370518080.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_mini_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_mini_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.bert_mini_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_mini_arabic| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|43.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md new file mode 100644 index 00000000000000..ce6a900f6c10fe --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bert_political_election2020_twitter_mlm_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from kornosk) +author: John Snow Labs +name: bert_embeddings_bert_political_election2020_twitter_mlm +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `bert-political-election2020-twitter-mlm` is an English model originally trained by `kornosk`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_political_election2020_twitter_mlm_en_5.0.0_3.0_1687370471142.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bert_political_election2020_twitter_mlm_en_5.0.0_3.0_1687370471142.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_political_election2020_twitter_mlm").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bert_political_election2020_twitter_mlm","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert_political_election2020_twitter_mlm").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bert_political_election2020_twitter_mlm| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md new file mode 100644 index 00000000000000..a0e6b0632c7e43 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_beto_gn_base_cased_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (from mmaguero) +author: John Snow Labs +name: bert_embeddings_beto_gn_base_cased +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `beto-gn-base-cased` is a Spanish model originally trained by `mmaguero`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_beto_gn_base_cased_es_5.0.0_3.0_1687370922012.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_beto_gn_base_cased_es_5.0.0_3.0_1687370922012.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.beto_gn_base_cased").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_beto_gn_base_cased","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.beto_gn_base_cased").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_beto_gn_base_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|408.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md new file mode 100644 index 00000000000000..259fb891ba3560 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_bioclinicalbert_finetuned_covid_papers_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English BERT Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: bert_embeddings_bioclinicalbert_finetuned_covid_papers +date: 2023-06-21 +tags: [en, open_source, bert, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bioclinicalBERT-finetuned-covid-papers` is a English model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.0_1687337369326.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_bioclinicalbert_finetuned_covid_papers_en_5.0.0_3.0_1687337369326.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.covid_bio_clinical.finetuned").predict("""PUT YOUR STRING HERE""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_bioclinicalbert_finetuned_covid_papers","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.covid_bio_clinical.finetuned").predict("""PUT YOUR STRING HERE""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_bioclinicalbert_finetuned_covid_papers| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|403.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md new file mode 100644 index 00000000000000..2f13e1e7ba8bd4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_carlbert_webex_mlm_spatial_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from aditeyabaral) +author: John Snow Labs +name: bert_embeddings_carlbert_webex_mlm_spatial +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `carlbert-webex-mlm-spatial` is a English model originally trained by `aditeyabaral`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.0_1687334153231.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_carlbert_webex_mlm_spatial_en_5.0.0_3.0_1687334153231.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_carlbert_webex_mlm_spatial","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_carlbert_webex_mlm_spatial| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md new file mode 100644 index 00000000000000..1e2661ff475de6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chefberto_italian_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Embeddings (Base, Recipes) +author: John Snow Labs +name: bert_embeddings_chefberto_italian_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `chefberto-italian-cased` is an Italian model originally trained by `vinhood`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chefberto_italian_cased_it_5.0.0_3.0_1687371210449.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chefberto_italian_cased_it_5.0.0_3.0_1687371210449.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.chefberto_italian_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chefberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.chefberto_italian_cased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_chefberto_italian_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md new file mode 100644 index 00000000000000..8b4dcbb11e39e4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_bert_uncased_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Base, Uncased, Chemical) +author: John Snow Labs +name: bert_embeddings_chemical_bert_uncased +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `chemical-bert-uncased` is an English model originally trained by `recobo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_bert_uncased_en_5.0.0_3.0_1687370963306.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_bert_uncased_en_5.0.0_3.0_1687370963306.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.chemical_bert_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_bert_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.chemical_bert_uncased").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_chemical_bert_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md new file mode 100644 index 00000000000000..1d3affe542dc5c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from Shafin) +author: John Snow Labs +name: bert_embeddings_chemical_uncased_finetuned_cust_c1_cust +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `chemical-bert-uncased-finetuned-cust-c1-cust` is a English model originally trained by `Shafin`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.0_1687335830911.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c1_cust_en_5.0.0_3.0_1687335830911.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c1_cust","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_chemical_uncased_finetuned_cust_c1_cust| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md new file mode 100644 index 00000000000000..02fdd7d0894952 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_chemical_uncased_finetuned_cust_c2_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from Shafin) +author: John Snow Labs +name: bert_embeddings_chemical_uncased_finetuned_cust_c2 +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `chemical-bert-uncased-finetuned-cust-c2` is a English model originally trained by `shafin`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.0_1687335658105.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_chemical_uncased_finetuned_cust_c2_en_5.0.0_3.0_1687335658105.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_chemical_uncased_finetuned_cust_c2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_chemical_uncased_finetuned_cust_c2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md new file mode 100644 index 00000000000000..60503fbe2496d8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_childes_bert_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from smeylan) +author: John Snow Labs +name: bert_embeddings_childes_bert +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `childes-bert` is an English model originally trained by `smeylan`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_childes_bert_en_5.0.0_3.0_1687371245330.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_childes_bert_en_5.0.0_3.0_1687371245330.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.childes_bert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_childes_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.childes_bert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_childes_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md new file mode 100644 index 00000000000000..ecc100c7def6f7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_128_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical English Bert Embeddings (Base, 128 dimension) +author: John Snow Labs +name: bert_embeddings_clinical_pubmed_bert_base_128 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `clinical-pubmed-bert-base-128` is an English model originally trained by `Tsubasaz`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_128_en_5.0.0_3.0_1687342663053.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_128_en_5.0.0_3.0_1687342663053.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_128").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_128","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_128").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_clinical_pubmed_bert_base_128| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|408.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md new file mode 100644 index 00000000000000..2c83ed64e84a41 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_clinical_pubmed_bert_base_512_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical English Bert Embeddings (Base, 512 dimension) +author: John Snow Labs +name: bert_embeddings_clinical_pubmed_bert_base_512 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `clinical-pubmed-bert-base-512` is an English model originally trained by `Tsubasaz`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_512_en_5.0.0_3.0_1687341838471.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_clinical_pubmed_bert_base_512_en_5.0.0_3.0_1687341838471.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_512").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_clinical_pubmed_bert_base_512","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.clinical_pubmed_bert_base_512").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_clinical_pubmed_bert_base_512| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|408.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md new file mode 100644 index 00000000000000..89a6839a95ee56 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_crosloengual_bert_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Multilingual (Croatian, Slovenian, English) Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_crosloengual_bert +date: 2023-06-21 +tags: [bert, embeddings, en, hr, sl, xx, multilingual, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `crosloengual-bert` is an English model originally trained by `EMBEDDIA`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_crosloengual_bert_en_5.0.0_3.0_1687341501117.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_crosloengual_bert_en_5.0.0_3.0_1687341501117.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.crosloengual_bert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_crosloengual_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.crosloengual_bert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_crosloengual_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|463.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md new file mode 100644 index 00000000000000..0c6f71e2a0ddb4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dbert_ko.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Korean Bert Embeddings (from deeq) +author: John Snow Labs +name: bert_embeddings_dbert +date: 2023-06-21 +tags: [bert, embeddings, ko, open_source, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dbert` is a Korean model originally trained by `deeq`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dbert_ko_5.0.0_3.0_1687341138674.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dbert_ko_5.0.0_3.0_1687341138674.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.dbert").predict("""나는 Spark NLP를 좋아합니다""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dbert","ko") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ko.embed.dbert").predict("""나는 Spark NLP를 좋아합니다""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ko| +|Size:|421.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md new file mode 100644 index 00000000000000..22fae6bd9c819c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_deberta_base_uncased_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: English BertForMaskedLM Base Uncased model (from mlcorelib) +author: John Snow Labs +name: bert_embeddings_deberta_base_uncased +date: 2023-06-21 +tags: [en, open_source, bert_embeddings, bertformaskedlm, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertForMaskedLM model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `deberta-base-uncased` is an English model originally trained by `mlcorelib`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_deberta_base_uncased_en_5.0.0_3.0_1687341134871.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_deberta_base_uncased_en_5.0.0_3.0_1687341134871.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_base_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_deberta_base_uncased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_base_uncased").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_deberta_base_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md new file mode 100644 index 00000000000000..e29412f032a148 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_distil_clinical_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from nlpie) +author: John Snow Labs +name: bert_embeddings_distil_clinical +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `distil-clinicalbert` is an English model originally trained by `nlpie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.0_1687334036385.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_distil_clinical_en_5.0.0_3.0_1687334036385.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_distil_clinical","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark-NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_distil_clinical| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|244.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md new file mode 100644 index 00000000000000..622e96cea211e0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (Base, Passage, Allqa) +author: John Snow Labs +name: bert_embeddings_dpr_spanish_passage_encoder_allqa_base +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-passage_encoder-allqa-base` is a Spanish model originally trained by `IIC`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es_5.0.0_3.0_1687341854288.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_allqa_base_es_5.0.0_3.0_1687341854288.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dpr_spanish_passage_encoder_allqa_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|409.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md new file mode 100644 index 00000000000000..5b4101b177a992 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_passage_encoder_squades_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (Base, Passage, Squades) +author: John Snow Labs +name: bert_embeddings_dpr_spanish_passage_encoder_squades_base +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-passage_encoder-squades-base` is a Spanish model originally trained by `IIC`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_squades_base_es_5.0.0_3.0_1687341276775.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_passage_encoder_squades_base_es_5.0.0_3.0_1687341276775.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_passage_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_passage_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dpr_spanish_passage_encoder_squades_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|409.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md new file mode 100644 index 00000000000000..fbf26507302d99 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_allqa_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (Base, Question, Allqa) +author: John Snow Labs +name: bert_embeddings_dpr_spanish_question_encoder_allqa_base +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-question_encoder-allqa-base` is a Spanish model originally trained by `IIC`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_allqa_base_es_5.0.0_3.0_1687340961201.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_allqa_base_es_5.0.0_3.0_1687340961201.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_allqa_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_allqa_base").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dpr_spanish_question_encoder_allqa_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|409.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md new file mode 100644 index 00000000000000..537f94ae494528 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dpr_spanish_question_encoder_squades_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Bert Embeddings (Base, Question, Squades) +author: John Snow Labs +name: bert_embeddings_dpr_spanish_question_encoder_squades_base +date: 2023-06-21 +tags: [bert, embeddings, es, open_source, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dpr-spanish-question_encoder-squades-base` is a Spanish model originally trained by `IIC`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_squades_base_es_5.0.0_3.0_1687341460131.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dpr_spanish_question_encoder_squades_base_es_5.0.0_3.0_1687341460131.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Me encanta chispa nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dpr_spanish_question_encoder_squades_base","es") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Me encanta chispa nlp").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.dpr_spanish_question_encoder_squades_base").predict("""Me encanta chispa nlp""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dpr_spanish_question_encoder_squades_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|409.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md new file mode 100644 index 00000000000000..4a1f66ee841cec --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_dziribert_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from alger-ia) +author: John Snow Labs +name: bert_embeddings_dziribert +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `dziribert` is an Arabic model originally trained by `alger-ia`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_dziribert_ar_5.0.0_3.0_1687341113062.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_dziribert_ar_5.0.0_3.0_1687341113062.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.dziribert").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_dziribert","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.dziribert").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_dziribert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|462.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md new file mode 100644 index 00000000000000..8bee08b62d0c53 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Uncased) +author: John Snow Labs +name: bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `false-positives-scancode-bert-base-uncased-L8-1` is an English model originally trained by `ayansinha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en_5.0.0_3.0_1687340166023.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1_en_5.0.0_3.0_1687340166023.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.false_positives_scancode_bert_base_uncased_L8_1").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.false_positives_scancode_bert_base_uncased_L8_1").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_false_positives_scancode_bert_base_uncased_L8_1| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md new file mode 100644 index 00000000000000..a8f6efb7ae0030 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finbert_pretrain_yiyanghkust_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: Financial English Bert Embeddings (Base, Communication texts) +author: John Snow Labs +name: bert_embeddings_finbert_pretrain_yiyanghkust +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Financial English Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `finbert-pretrain-yiyanghkust` is an English model originally available on Hugging Face as `yiyanghkust/finbert-pretrain`. 
It was trained on the following datasets: + +- Corporate Reports 10-K & 10-Q: 2.5B tokens +- Earnings Call Transcripts: 1.3B tokens +- Analyst Reports: 1.1B tokens + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340890257.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finbert_pretrain_yiyanghkust_en_5.0.0_3.0_1687340890257.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finbert_pretrain_yiyanghkust").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finbert_pretrain_yiyanghkust","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finbert_pretrain_yiyanghkust").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_finbert_pretrain_yiyanghkust| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md new file mode 100644 index 00000000000000..e01933d053546d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_finest_bert_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Multilingual (Finnish, Estonian, English) Bert Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_finest_bert +date: 2023-06-21 +tags: [bert, embeddings, fi, et, en, xx, multilingual, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `finest-bert` is an English model originally trained by `EMBEDDIA`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.0_1687339089124.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_finest_bert_en_5.0.0_3.0_1687339089124.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finest_bert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_finest_bert","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.finest_bert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_finest_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|535.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md new file mode 100644 index 00000000000000..f7c53a48bd8905 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_gbert_base_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Bert Embeddings (Base, Cased) +author: John Snow Labs +name: bert_embeddings_gbert_base +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `gbert-base` is a German model originally trained by `deepset`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.0_1687339723694.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_gbert_base_de_5.0.0_3.0_1687339723694.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.gbert_base").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_gbert_base","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.gbert_base").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_gbert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|409.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md new file mode 100644 index 00000000000000..bd29b725b81ae5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_german_financial_statements_bert_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Financial Bert Word Embeddings +author: John Snow Labs +name: bert_embeddings_german_financial_statements_bert +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial Bert Word Embeddings model, trained on German Financial Statements. Uploaded to Hugging Face, adapted and imported into Spark NLP. `german-financial-statements-bert` is a German Financial model originally trained upon 100,000 natural language annual financial statements. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.0_1687339007310.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_german_financial_statements_bert_de_5.0.0_3.0_1687339007310.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.german_financial_statements_bert").predict("""Ich liebe Funken NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Funken NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_german_financial_statements_bert","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Funken NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.german_financial_statements_bert").predict("""Ich liebe Funken NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_german_financial_statements_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md new file mode 100644 index 00000000000000..82f3c85d2247d3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hateBERT_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from GroNLP) +author: John Snow Labs +name: bert_embeddings_hateBERT +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `hateBERT` is an English model originally trained by `GroNLP`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.0_1687340123478.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_hateBERT_en_5.0.0_3.0_1687340123478.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.hateBERT").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hateBERT","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.hateBERT").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_hateBERT| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md new file mode 100644 index 00000000000000..0ac43ca8f34236 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_hseBert_it_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Bert Embeddings (from bullmount) +author: John Snow Labs +name: bert_embeddings_hseBert_it_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `hseBert-it-cased` is an Italian model originally trained by `bullmount`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_hseBert_it_cased_it_5.0.0_3.0_1687340783377.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_hseBert_it_cased_it_5.0.0_3.0_1687340783377.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.hseBert_it_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_hseBert_it_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.hseBert_it_cased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_hseBert_it_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md new file mode 100644 index 00000000000000..cb1d065f6341ec --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_hi_bert_hi.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Hindi Bert Embeddings +author: John Snow Labs +name: bert_embeddings_indic_transformers_hi_bert +date: 2023-06-21 +tags: [bert, embeddings, hi, open_source, onnx] +task: Embeddings +language: hi +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-hi-bert` is a Hindi model originally trained by `neuralspace-reverie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.0_1687339963111.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_hi_bert_hi_5.0.0_3.0_1687339963111.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_bert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मुझे स्पार्क एनएलपी पसंद है"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_hi_bert","hi") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मुझे स्पार्क एनएलपी पसंद है").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("hi.embed.indic_transformers_hi_bert").predict("""मुझे स्पार्क एनएलपी पसंद है""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_indic_transformers_hi_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|hi| +|Size:|609.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md new file mode 100644 index 00000000000000..286777564e9003 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_indic_transformers_te_bert_te.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Telugu Bert Embeddings (from neuralspace-reverie) +author: John Snow Labs +name: bert_embeddings_indic_transformers_te_bert +date: 2023-06-21 +tags: [bert, embeddings, te, open_source, onnx] +task: Embeddings +language: te +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `indic-transformers-te-bert` is a Telugu model originally trained by `neuralspace-reverie`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.0_1687340459352.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_indic_transformers_te_bert_te_5.0.0_3.0_1687340459352.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.indic_transformers_te_bert").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_indic_transformers_te_bert","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.indic_transformers_te_bert").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_indic_transformers_te_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|te| +|Size:|609.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md new file mode 100644 index 00000000000000..9f834ccdc83fa0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_imdb_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese Bert Embeddings (Small, Imdb) +author: John Snow Labs +name: bert_embeddings_javanese_bert_small_imdb +date: 2023-06-21 +tags: [bert, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-bert-small-imdb` is a Javanese model originally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_imdb_jv_5.0.0_3.0_1687341195384.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_imdb_jv_5.0.0_3.0_1687341195384.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small_imdb").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small_imdb","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small_imdb").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_javanese_bert_small_imdb| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|407.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md new file mode 100644 index 00000000000000..92986039f002be --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_javanese_bert_small_jv.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Javanese Bert Embeddings (Small, Wikipedia) +author: John Snow Labs +name: bert_embeddings_javanese_bert_small +date: 2023-06-21 +tags: [bert, embeddings, jv, open_source, onnx] +task: Embeddings +language: jv +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `javanese-bert-small` is a Javanese model originally trained by `w11wo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.0_1687339377809.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_javanese_bert_small_jv_5.0.0_3.0_1687339377809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_javanese_bert_small","jv") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("jv.embed.javanese_bert_small").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_javanese_bert_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|jv| +|Size:|407.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md new file mode 100644 index 00000000000000..ac620f546f44ce --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_jobbert_base_cased_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English BERT Embeddings (from jjzha) +author: John Snow Labs +name: bert_embeddings_jobbert_base_cased +date: 2023-06-21 +tags: [bert, en, embeddings, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `jobbert-base-cased` is an English model originally trained by `jjzha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.0_1687336524220.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_jobbert_base_cased_en_5.0.0_3.0_1687336524220.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.cased_base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_jobbert_base_cased","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.bert.cased_base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_jobbert_base_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|402.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md new file mode 100644 index 00000000000000..a490d46ea569a7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legal_bert_base_uncased_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Legal English Bert Embeddings (Base, Uncased) +author: John Snow Labs +name: bert_embeddings_legal_bert_base_uncased +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, legal, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Legal Pretrained Bert Embeddings model, trained with uncased text, uploaded to Hugging Face, adapted and imported into Spark NLP. `legal-bert-base-uncased` is an English model originally trained by `nlpaueb`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_legal_bert_base_uncased_en_5.0.0_3.0_1687341978829.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_legal_bert_base_uncased_en_5.0.0_3.0_1687341978829.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.legal_bert_base_uncased").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legal_bert_base_uncased","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.legal_bert_base_uncased").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_legal_bert_base_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md new file mode 100644 index 00000000000000..5dc84ccc68d550 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_legalbert_adept_en.md @@ -0,0 +1,135 @@ +--- +layout: model +title: English Legal BERT Embeddings +author: John Snow Labs +name: bert_embeddings_legalbert_adept +date: 2023-06-21 +tags: [bert, en, english, embeddings, transformer, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legalbert-adept` is an English model originally trained by `hatemestinbejaia`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.0_1687335917569.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_legalbert_adept_en_5.0.0_3.0_1687335917569.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_legalbert_adept","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_legalbert_adept| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|407.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md new file mode 100644 index 00000000000000..6e0309525b771a --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (Cased) +author: John Snow Labs +name: bert_embeddings_lic_class_scancode_bert_base_cased_L32_1 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `lic-class-scancode-bert-base-cased-L32-1` is an English model originally trained by `ayansinha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en_5.0.0_3.0_1687351576851.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_lic_class_scancode_bert_base_cased_L32_1_en_5.0.0_3.0_1687351576851.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.lic_class_scancode_bert_base_cased_L32_1").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lic_class_scancode_bert_base_cased_L32_1","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.lic_class_scancode_bert_base_cased_L32_1").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_lic_class_scancode_bert_base_cased_L32_1| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md new file mode 100644 index 00000000000000..022feacea52d3c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_lsg16k_Italian_Legal_it.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Italian Legal BERT Embeddings +author: John Snow Labs +name: bert_embeddings_lsg16k_Italian_Legal +date: 2023-06-21 +tags: [longformer, it, italian, embeddings, transformer, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `lsg16k-Italian-Legal-BERT` is an Italian model originally trained by `dlicari`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.0_1687335744395.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_lsg16k_Italian_Legal_it_5.0.0_3.0_1687335744395.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = nlp.DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = nlp.Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = nlp.BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = nlp.Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_lsg16k_Italian_Legal","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_lsg16k_Italian_Legal| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|454.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md new file mode 100644 index 00000000000000..6b39af7927411f --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_marathi_bert_mr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Marathi Bert Embeddings +author: John Snow Labs +name: bert_embeddings_marathi_bert +date: 2023-06-21 +tags: [bert, embeddings, mr, open_source, onnx] +task: Embeddings +language: mr +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `marathi-bert` is a Marathi model originally trained by `l3cube-pune`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_marathi_bert_mr_5.0.0_3.0_1687350857061.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_marathi_bert_mr_5.0.0_3.0_1687350857061.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.marathi_bert").predict("""मला स्पार्क एनएलपी आवडते""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["मला स्पार्क एनएलपी आवडते"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_marathi_bert","mr") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("मला स्पार्क एनएलपी आवडते").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("mr.embed.marathi_bert").predict("""मला स्पार्क एनएलपी आवडते""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_marathi_bert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|mr| +|Size:|665.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md new file mode 100644 index 00000000000000..3d6a26c549f735 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_mbert_ar_c19_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (Mbert model, Covid-19) +author: John Snow Labs +name: bert_embeddings_mbert_ar_c19 +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `mbert_ar_c19` is an Arabic model originally trained by `moha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_mbert_ar_c19_ar_5.0.0_3.0_1687351164607.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_mbert_ar_c19_ar_5.0.0_3.0_1687351164607.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.mbert_ar_c19").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_mbert_ar_c19","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.mbert_ar_c19").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_mbert_ar_c19| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|624.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md new file mode 100644 index 00000000000000..bc40d7959ba191 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_multi_dialect_bert_base_arabic_ar.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Arabic Bert Embeddings (from bashar-talafha) +author: John Snow Labs +name: bert_embeddings_multi_dialect_bert_base_arabic +date: 2023-06-21 +tags: [bert, embeddings, ar, open_source, onnx] +task: Embeddings +language: ar +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `multi-dialect-bert-base-arabic` is an Arabic model originally trained by `bashar-talafha`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_multi_dialect_bert_base_arabic_ar_5.0.0_3.0_1687351229326.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_multi_dialect_bert_base_arabic_ar_5.0.0_3.0_1687351229326.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.multi_dialect_bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["أنا أحب شرارة NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_multi_dialect_bert_base_arabic","ar") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("أنا أحب شرارة NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("ar.embed.multi_dialect_bert_base_arabic").predict("""أنا أحب شرارة NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_multi_dialect_bert_base_arabic| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ar| +|Size:|411.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md new file mode 100644 index 00000000000000..2edd51e298a1d8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_netbert_en.md @@ -0,0 +1,153 @@ +--- +layout: model +title: English BertForMaskedLM Cased model (from antoinelouis) +author: John Snow Labs +name: bert_embeddings_netbert +date: 2023-06-21 +tags: [en, open_source, bert_embeddings, bertformaskedlm, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertForMaskedLM model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `netbert` is an English model originally trained by `antoinelouis`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_netbert_en_5.0.0_3.0_1687351022341.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_netbert_en_5.0.0_3.0_1687351022341.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.netbert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, bert_loaded]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val bert_loaded = BertEmbeddings.pretrained("bert_embeddings_netbert","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, bert_loaded)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.netbert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_netbert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md new file mode 100644 index 00000000000000..3566124863b301 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_olm_base_uncased_oct_2022_en.md @@ -0,0 +1,140 @@ +--- +layout: model +title: English Bert Embeddings Cased model (from Tristan) +author: John Snow Labs +name: bert_embeddings_olm_base_uncased_oct_2022 +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, en, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `olm-bert-base-uncased-oct-2022` is an English model originally trained by `Tristan`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.0_1687336305222.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_olm_base_uncased_oct_2022_en_5.0.0_3.0_1687336305222.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_olm_base_uncased_oct_2022","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_olm_base_uncased_oct_2022| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|464.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md new file mode 100644 index 00000000000000..0308fcbcd07582 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_pretrain_ko.md @@ -0,0 +1,140 @@ +--- +layout: model +title: Korean Bert Embeddings Cased model (from onlydj96) +author: John Snow Labs +name: bert_embeddings_pretrain +date: 2023-06-21 +tags: [open_source, bert, bert_embeddings, bertformaskedlm, ko, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `bert_pretrain` is a Korean model originally trained by `onlydj96`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.0_1687336252702.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_pretrain_ko_5.0.0_3.0_1687336252702.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_pretrain","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_pretrain| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|ko| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md new file mode 100644 index 00000000000000..f12d896dcbc372 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_psych_search_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Bert Embeddings (from nlp4good) +author: John Snow Labs +name: bert_embeddings_psych_search +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `psych-search` is an English model originally trained by `nlp4good`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_psych_search_en_5.0.0_3.0_1687350768319.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_psych_search_en_5.0.0_3.0_1687350768319.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.psych_search").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_psych_search","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.psych_search").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_psych_search| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|409.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md new file mode 100644 index 00000000000000..1e641bcf5d79b3 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_scibert_scivocab_finetuned_cord19_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English BERT Embeddings Cased model (from mrm8488) +author: John Snow Labs +name: bert_embeddings_scibert_scivocab_finetuned_cord19 +date: 2023-06-21 +tags: [en, open_source, bert, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BERT Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `scibert_scivocab-finetuned-CORD19` is an English model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.0_1687336817133.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_scibert_scivocab_finetuned_cord19_en_5.0.0_3.0_1687336817133.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.scibert.cord19_scibert.finetuned").predict("""PUT YOUR STRING HERE""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["PUT YOUR STRING HERE"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_scibert_scivocab_finetuned_cord19","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("PUT YOUR STRING HERE").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.scibert.cord19_scibert.finetuned").predict("""PUT YOUR STRING HERE""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_scibert_scivocab_finetuned_cord19| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|409.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md new file mode 100644 index 00000000000000..8e1158bbb801c8 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_base_en.md @@ -0,0 +1,154 @@ +--- +layout: model +title: Financial English BERT Embeddings (Base) +author: John Snow Labs +name: bert_embeddings_sec_bert_base +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, financial, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Financial Pretrained BERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sec-bert-base` is an English model originally trained by `nlpaueb`. This is the reference base model, which means it uses the same architecture as BERT-BASE trained on financial documents. + +If you are interested in Financial Embeddings, take a look also at these two models: + +- [sec-num](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_num_en_3_0.html): Same as this base model but we replace every number token with a [NUM] pseudo-token (handling all numeric expressions in a uniform manner, disallowing their fragmentation). 
+- [sec-shape](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_sh_en_3_0.html): Same as this base model but we replace numbers with pseudo-tokens that represent the number’s shape, so numeric expressions (of known shapes) are no longer fragmented, e.g., '53.2' becomes '[XX.X]' and '40,200.5' becomes '[XX,XXX.X]'. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.0_1687339042219.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_base_en_5.0.0_3.0_1687339042219.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sec_bert_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md new file mode 100644 index 00000000000000..0958df101c9131 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sec_bert_sh_en.md @@ -0,0 +1,155 @@ +--- +layout: model +title: Financial English BERT Embeddings (Number shape masking) +author: John Snow Labs +name: bert_embeddings_sec_bert_sh +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, financial, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial BERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sec-bert-shape` is an English model originally trained by `nlpaueb`. This model is the same as Bert Base but we replace numbers with pseudo-tokens that represent the number’s shape, so numeric expressions (of known shapes) are no longer fragmented, e.g., '53.2' becomes '[XX.X]' and '40,200.5' becomes '[XX,XXX.X]'. + +If you are interested in Financial Embeddings, take a look also at these two models: + +- [sec-base](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_base_en_3_0.html): Same as BERT Base but trained with financial documents. 
+- [sec-num](https://nlp.johnsnowlabs.com/2022/04/12/bert_embeddings_sec_bert_num_en_3_0.html): Same as Bert sec-base but we replace every number token with a [NUM] pseudo-token, handling all numeric expressions in a uniform manner and disallowing their fragmentation. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.0_1687339128341.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sec_bert_sh_en_5.0.0_3.0_1687339128341.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_sh").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sec_bert_sh","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.sec_bert_sh").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sec_bert_sh| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|406.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md new file mode 100644 index 00000000000000..ce8ab67067aa3f --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikubert_zh.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Chinese Bert Embeddings (Siku Quanshu corpus) +author: John Snow Labs +name: bert_embeddings_sikubert +date: 2023-06-21 +tags: [bert, embeddings, zh, open_source, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sikubert` is a Chinese model originally trained by `SIKU-BERT`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikubert_zh_5.0.0_3.0_1687343740087.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikubert_zh_5.0.0_3.0_1687343740087.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikubert").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikubert","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikubert").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sikubert| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|zh| +|Size:|406.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md new file mode 100644 index 00000000000000..68d128c492b7cd --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_sikuroberta_zh.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Chinese Bert Embeddings (from SIKU-BERT) +author: John Snow Labs +name: bert_embeddings_sikuroberta +date: 2023-06-21 +tags: [bert, embeddings, zh, open_source, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `sikuroberta` is a Chinese model originally trained by `SIKU-BERT`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikuroberta_zh_5.0.0_3.0_1687343322944.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_sikuroberta_zh_5.0.0_3.0_1687343322944.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikuroberta").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_sikuroberta","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.sikuroberta").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_sikuroberta| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|zh| +|Size:|405.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md new file mode 100644 index 00000000000000..a521a954fb4236 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_telugu_bertu_te.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Telugu Bert Embeddings +author: John Snow Labs +name: bert_embeddings_telugu_bertu +date: 2023-06-21 +tags: [bert, embeddings, te, open_source, onnx] +task: Embeddings +language: te +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `telugu_bertu` is a Telugu model originally trained by `kuppuluri`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_telugu_bertu_te_5.0.0_3.0_1687343021533.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_telugu_bertu_te_5.0.0_3.0_1687343021533.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.telugu_bertu").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_telugu_bertu","te") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("te.embed.telugu_bertu").predict("""నేను స్పార్క్ nlp ను ప్రేమిస్తున్నాను""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_telugu_bertu| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|te| +|Size:|412.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md new file mode 100644 index 00000000000000..64d487ee4850f5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wineberto_italian_cased_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Embeddings (Base, Wines description) +author: John Snow Labs +name: bert_embeddings_wineberto_italian_cased +date: 2023-06-21 +tags: [bert, embeddings, it, open_source, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `wineberto-italian-cased` is an Italian model originally trained by `vinhood`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687343289463.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_wineberto_italian_cased_it_5.0.0_3.0_1687343289463.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.wineberto_italian_cased").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wineberto_italian_cased","it") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.wineberto_italian_cased").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_wineberto_italian_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|it| +|Size:|412.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md new file mode 100644 index 00000000000000..698131409af3a5 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_embeddings_wobert_chinese_plus_zh.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Chinese Bert Embeddings (from qinluo) +author: John Snow Labs +name: bert_embeddings_wobert_chinese_plus +date: 2023-06-21 +tags: [bert, embeddings, zh, open_source, onnx] +task: Embeddings +language: zh +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Bert Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `wobert-chinese-plus` is a Chinese model originally trained by `qinluo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_embeddings_wobert_chinese_plus_zh_5.0.0_3.0_1687343185496.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_embeddings_wobert_chinese_plus_zh_5.0.0_3.0_1687343185496.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.wobert_chinese_plus").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_embeddings_wobert_chinese_plus","zh") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("zh.embed.wobert_chinese_plus").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_embeddings_wobert_chinese_plus| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|zh| +|Size:|464.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md b/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md new file mode 100644 index 00000000000000..13d2898c521b50 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-bert_sentence_embeddings_financial_de.md @@ -0,0 +1,151 @@ +--- +layout: model +title: German Financial Bert Word Embeddings +author: John Snow Labs +name: bert_sentence_embeddings_financial +date: 2023-06-21 +tags: [bert, embeddings, de, open_source, financial, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Although in the name of the model you will see the word `sentence`, this is a Word Embeddings Model. + +Financial Pretrained BERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `german-financial-statements-bert` is a German model originally trained by `fabianrausch`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.0_1687338810949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_sentence_embeddings_financial_de_5.0.0_3.0_1687338810949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark-NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert.finance").predict("""Ich liebe Spark-NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark-NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("bert_sentence_embeddings_financial","de") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark-NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.bert.finance").predict("""Ich liebe Spark-NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bert_sentence_embeddings_financial| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|de| +|Size:|406.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md new file mode 100644 index 00000000000000..a38274560def80 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_all_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical Portuguese Bert Embeddings (Biomedical and Clinical) +author: John Snow Labs +name: biobert_embeddings_all +date: 2023-06-21 +tags: [biobert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BioBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `biobertpt-all` is a Portuguese model originally trained by `pucpr`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_embeddings_all_pt_5.0.0_3.0_1687342387740.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_embeddings_all_pt_5.0.0_3.0_1687342387740.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_all").predict("""Odeio o cancro""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_all","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_all").predict("""Odeio o cancro""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|biobert_embeddings_all| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|664.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md new file mode 100644 index 00000000000000..b86fe840106cfa --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_biomedical_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical Portuguese Bert Embeddings (Biomedical) +author: John Snow Labs +name: biobert_embeddings_biomedical +date: 2023-06-21 +tags: [biobert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BioBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `biobertpt-bio` is a Portuguese model originally trained by `pucpr`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_embeddings_biomedical_pt_5.0.0_3.0_1687343400949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_embeddings_biomedical_pt_5.0.0_3.0_1687343400949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_biomedical").predict("""Odeio o cancro""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_biomedical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_biomedical").predict("""Odeio o cancro""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|biobert_embeddings_biomedical| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|665.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md new file mode 100644 index 00000000000000..d8a80c64c56b34 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_embeddings_clinical_pt.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Clinical Portuguese Bert Embeddings (Clinical) +author: John Snow Labs +name: biobert_embeddings_clinical +date: 2023-06-21 +tags: [biobert, embeddings, pt, open_source, onnx] +task: Embeddings +language: pt +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BioBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `biobertpt-clin` is a Portuguese model originally trained by `pucpr`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_embeddings_clinical_pt_5.0.0_3.0_1687342893170.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_embeddings_clinical_pt_5.0.0_3.0_1687342893170.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_clinical").predict("""Odeio o cancro""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Odeio o cancro"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_embeddings_clinical","pt") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Odeio o cancro").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("pt.embed.gs_clinical").predict("""Odeio o cancro""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|biobert_embeddings_clinical| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|pt| +|Size:|665.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md b/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md new file mode 100644 index 00000000000000..794836f2f29942 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-biobert_pubmed_base_cased_v1.2_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: BioBERT Embeddings (Pubmed) +author: John Snow Labs +name: biobert_pubmed_base_cased_v1.2 +date: 2023-06-21 +tags: [bert, embeddings, en, open_source, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is the v1.2 of [biobert_pubmed_base_cased](https://nlp.johnsnowlabs.com/2020/09/19/biobert_pubmed_base_cased.html) model and contains pre-trained weights of BioBERT, a language representation model for biomedical domain, especially designed for biomedical text mining tasks such as biomedical named entity recognition, relation extraction, question answering, etc. The details are described in the paper "[BioBERT: a pre-trained biomedical language representation model for biomedical text mining](https://arxiv.org/abs/1901.08746v2)". 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.0_1687336480762.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/biobert_pubmed_base_cased_v1.2_en_5.0.0_3.0_1687336480762.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I hate cancer"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I hate cancer").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.biobert.pubmed.cased_base").predict("""I hate cancer""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I hate cancer"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("biobert_pubmed_base_cased_v1.2","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I hate cancer").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.biobert.pubmed.cased_base").predict("""I hate cancer""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|biobert_pubmed_base_cased_v1.2| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|403.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md new file mode 100644 index 00000000000000..eb920c7ffb15b9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_0_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_0_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-0-cased-generator` is a German model originally trained by `stefan-it`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.0_1687338403600.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_0_cased_generator_de_5.0.0_3.0_1687338403600.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_64d").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_0_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_64d").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_0_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|221.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md new file mode 100644 index 00000000000000..d6613cc7e8cf70 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_1000000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-1000000-cased-generator` is a German model originally trained by `stefan-it`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.0_1687337566476.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_1000000_cased_generator_de_5.0.0_3.0_1687337566476.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_1000000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_1000000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_1000000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_1000000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md new file mode 100644 index 00000000000000..b1110b5c50cebd --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_100000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-100000-cased-generator` is a German model originally trained by `stefan-it`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.0_1687337430315.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_100000_cased_generator_de_5.0.0_3.0_1687337430315.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_100000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_100000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_100000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_100000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.0 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md new file mode 100644 index 00000000000000..e90872b5cf0574 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_200000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-200000-cased-generator` is a German model originally trained by `stefan-it`.
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.0_1687337323809.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_200000_cased_generator_de_5.0.0_3.0_1687337323809.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_200000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_200000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_200000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_200000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md new file mode 100644 index 00000000000000..e9a244fe395167 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_300000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-300000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.0_1687337742127.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_300000_cased_generator_de_5.0.0_3.0_1687337742127.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_300000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_300000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_300000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_300000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md new file mode 100644 index 00000000000000..cf463da7d3ede6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_400000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-400000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.0_1687338531671.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_400000_cased_generator_de_5.0.0_3.0_1687338531671.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_400000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_400000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_400000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_400000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md new file mode 100644 index 00000000000000..0e0368cae00d7c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_500000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-500000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.0_1687337310787.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_500000_cased_generator_de_5.0.0_3.0_1687337310787.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_500000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_500000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_500000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_500000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md new file mode 100644 index 00000000000000..3f63d4ca68b519 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_600000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-600000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.0_1687338289447.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_600000_cased_generator_de_5.0.0_3.0_1687338289447.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_600000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_600000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_600000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_600000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md new file mode 100644 index 00000000000000..6ce95cc170b433 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_700000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-700000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.0_1687336559193.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_700000_cased_generator_de_5.0.0_3.0_1687336559193.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_700000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_700000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_700000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_700000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md new file mode 100644 index 00000000000000..b3e7e29f1c1fe9 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_800000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-800000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.0_1687336668760.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_800000_cased_generator_de_5.0.0_3.0_1687336668760.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_800000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_800000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_800000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_800000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md new file mode 100644 index 00000000000000..9e2a0a0531c231 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from stefan-it) +author: John Snow Labs +name: electra_embeddings_electra_base_gc4_64k_900000_cased_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-gc4-64k-900000-cased-generator` is a German model originally trained by `stefan-it`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.0_1687336789214.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_gc4_64k_900000_cased_generator_de_5.0.0_3.0_1687336789214.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_900000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_gc4_64k_900000_cased_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.cased_base_gc4_64k_900000.by_stefan_it").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_gc4_64k_900000_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|222.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md new file mode 100644 index 00000000000000..e83a14d6f6ba62 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_generator_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Electra Embeddings (from google) +author: John Snow Labs +name: electra_embeddings_electra_base_generator +date: 2023-06-21 +tags: [en, open_source, electra, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-generator` is an English model originally trained by `google`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.0_1687337315482.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_generator_en_5.0.0_3.0_1687337315482.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.base").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|125.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md new file mode 100644 index 00000000000000..b650a25e829416 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_italian_xxl_cased_generator_it.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Italian Electra Embeddings (from dbmdz) +author: John Snow Labs +name: electra_embeddings_electra_base_italian_xxl_cased_generator +date: 2023-06-21 +tags: [it, open_source, electra, embeddings, onnx] +task: Embeddings +language: it +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-italian-xxl-cased-generator` is an Italian model originally trained by `dbmdz`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.0_1687337384147.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_italian_xxl_cased_generator_it_5.0.0_3.0_1687337384147.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.electra.cased_xxl_base").predict("""Adoro Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Adoro Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_italian_xxl_cased_generator","it") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Adoro Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("it.embed.electra.cased_xxl_base").predict("""Adoro Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_italian_xxl_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|it| +|Size:|127.4 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md new file mode 100644 index 00000000000000..e41b06ae521f40 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_cased_generator_tr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Turkish Electra Embeddings (from dbmdz) +author: John Snow Labs +name: electra_embeddings_electra_base_turkish_mc4_cased_generator +date: 2023-06-21 +tags: [tr, open_source, electra, embeddings, onnx] +task: Embeddings +language: tr +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-turkish-mc4-cased-generator` is a Turkish model originally trained by `dbmdz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.0_1687337596423.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_cased_generator_tr_5.0.0_3.0_1687337596423.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.cased_base").predict("""Spark NLP'yi seviyorum""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_cased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.cased_base").predict("""Spark NLP'yi seviyorum""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_turkish_mc4_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tr| +|Size:|129.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md new file mode 100644 index 00000000000000..41859cc555c6a2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Turkish Electra Embeddings (from dbmdz) +author: John Snow Labs +name: electra_embeddings_electra_base_turkish_mc4_uncased_generator +date: 2023-06-21 +tags: [tr, open_source, electra, embeddings, onnx] +task: Embeddings +language: tr +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-base-turkish-mc4-uncased-generator` is a Turkish model originally trained by `dbmdz`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.0_1687337246703.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_base_turkish_mc4_uncased_generator_tr_5.0.0_3.0_1687337246703.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.uncased_base").predict("""Spark NLP'yi seviyorum""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLP'yi seviyorum"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_base_turkish_mc4_uncased_generator","tr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLP'yi seviyorum").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tr.embed.electra.uncased_base").predict("""Spark NLP'yi seviyorum""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_base_turkish_mc4_uncased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tr| +|Size:|130.0 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md new file mode 100644 index 00000000000000..bbfb7f281e7a49 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_large_generator_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Electra Embeddings (from google) +author: John Snow Labs +name: electra_embeddings_electra_large_generator +date: 2023-06-21 +tags: [en, open_source, electra, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-large-generator` is an English model originally trained by `google`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.0_1687337805375.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_large_generator_en_5.0.0_3.0_1687337805375.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.large").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_large_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.large").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_large_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|191.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md new file mode 100644 index 00000000000000..dabe96a7d7b5a2 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_generator_en.md @@ -0,0 +1,149 @@ +--- +layout: model +title: English Electra Embeddings (from google) +author: John Snow Labs +name: electra_embeddings_electra_small_generator +date: 2023-06-21 +tags: [en, open_source, electra, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-small-generator` is an English model originally trained by `google`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.0_1687337729115.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_generator_en_5.0.0_3.0_1687337729115.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.small").predict("""I love Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_generator","en") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.electra.small").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_small_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|50.8 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md new file mode 100644 index 00000000000000..29d87ba99e8341 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_small_japanese_generator_ja.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Japanese Electra Embeddings (from Cinnamon) +author: John Snow Labs +name: electra_embeddings_electra_small_japanese_generator +date: 2023-06-21 +tags: [ja, open_source, electra, embeddings, onnx] +task: Embeddings +language: ja +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-small-japanese-generator` is a Japanese model originally trained by `Cinnamon`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.0_1687338737717.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_small_japanese_generator_ja_5.0.0_3.0_1687338737717.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLPが大好きです"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLPが大好きです").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Spark NLPが大好きです"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_small_japanese_generator","ja") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Spark NLPが大好きです").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_small_japanese_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ja| +|Size:|51.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md new file mode 100644 index 00000000000000..84aeba9befe4a7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_cased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_base_cased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-base-cased-generator` is a Tagalog model originally trained by `jcblaise`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.0_1687338660491.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_cased_generator_tl_5.0.0_3.0_1687338660491.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_base").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_base").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_base_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|129.9 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md new file mode 100644 index 00000000000000..e48ebc2ee91dfd --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_base_uncased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_base_uncased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-base-uncased-generator` is a Tagalog model originally trained by `jcblaise`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.0_1687338703736.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_base_uncased_generator_tl_5.0.0_3.0_1687338703736.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_base").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_base_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_base").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_base_uncased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|129.9 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md new file mode 100644 index 00000000000000..df146d8836dce7 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_cased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_small_cased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-small-cased-generator` is a Tagalog model originally trained by `jcblaise`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.0_1687338628903.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_cased_generator_tl_5.0.0_3.0_1687338628903.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_small").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_cased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.cased_small").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_small_cased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|18.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md new file mode 100644 index 00000000000000..58cea57ca00683 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electra_tagalog_small_uncased_generator_tl.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Tagalog Electra Embeddings (from jcblaise) +author: John Snow Labs +name: electra_embeddings_electra_tagalog_small_uncased_generator +date: 2023-06-21 +tags: [tl, open_source, electra, embeddings, onnx] +task: Embeddings +language: tl +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electra-tagalog-small-uncased-generator` is a Tagalog model originally trained by `jcblaise`. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.0_1687338586547.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electra_tagalog_small_uncased_generator_tl_5.0.0_3.0_1687338586547.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_small").predict("""Mahilig ako sa Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Mahilig ako sa Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electra_tagalog_small_uncased_generator","tl") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Mahilig ako sa Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("tl.embed.electra.uncased_small").predict("""Mahilig ako sa Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electra_tagalog_small_uncased_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|tl| +|Size:|18.2 MB| +|Case sensitive:|false| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md new file mode 100644 index 00000000000000..be35b303a82993 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_electricidad_base_generator_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Electra Uncased Embeddings (Oscar dataset) +author: John Snow Labs +name: electra_embeddings_electricidad_base_generator +date: 2023-06-21 +tags: [es, open_source, electra, embeddings, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `electricidad-base-generator` is a Spanish model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.0_1687337686007.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_electricidad_base_generator_es_5.0.0_3.0_1687337686007.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.electra.base").predict("""Amo Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Amo Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_electricidad_base_generator","es") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Amo Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.electra.base").predict("""Amo Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_electricidad_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|es| +|Size:|126.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md new file mode 100644 index 00000000000000..af1b52acd3ac9e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_base_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from krevas) +author: John Snow Labs +name: electra_embeddings_finance_koelectra_base_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial Korean Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `finance-koelectra-base-generator` is a Korean model originally trained by `krevas`. This is a Base model. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.0_1687337679070.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_base_generator_ko_5.0.0_3.0_1687337679070.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_finance_koelectra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|129.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md new file mode 100644 index 00000000000000..76acc791540c7c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_finance_koelectra_small_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from krevas) +author: John Snow Labs +name: electra_embeddings_finance_koelectra_small_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Financial Korean Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `finance-koelectra-small-generator` is a Korean model originally trained by `krevas`. This is a small (sm) version. Other bigger versions are available. 
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.0_1687338677896.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_finance_koelectra_small_generator_ko_5.0.0_3.0_1687338677896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_finance_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_finance_koelectra_small_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|51.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md new file mode 100644 index 00000000000000..35d0fddb181a54 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_base_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from deepset) +author: John Snow Labs +name: electra_embeddings_gelectra_base_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `gelectra-base-generator` is a German model originally trained by `deepset`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.0_1687338626775.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_base_generator_de_5.0.0_3.0_1687338626775.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.base").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_base_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.base").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_gelectra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|127.6 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md new file mode 100644 index 00000000000000..6d2e16d4eeefb6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_gelectra_large_generator_de.md @@ -0,0 +1,149 @@ +--- +layout: model +title: German Electra Embeddings (from deepset) +author: John Snow Labs +name: electra_embeddings_gelectra_large_generator +date: 2023-06-21 +tags: [de, open_source, electra, embeddings, onnx] +task: Embeddings +language: de +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `gelectra-large-generator` is a German model originally trained by `deepset`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.0_1687338033613.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_gelectra_large_generator_de_5.0.0_3.0_1687338033613.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.large").predict("""Ich liebe Spark NLP""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_gelectra_large_generator","de") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("Ich liebe Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("de.embed.electra.large").predict("""Ich liebe Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_gelectra_large_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|de| +|Size:|193.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md new file mode 100644 index 00000000000000..53e791d787729c --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_base_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-base-generator` is a Korean model originally trained by `monologg`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.0_1687337873576.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_generator_ko_5.0.0_3.0_1687337873576.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_koelectra_base_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|130.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md new file mode 100644 index 00000000000000..94addd53290ea6 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v2_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_base_v2_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-base-v2-generator` is a Korean model originally trained by `monologg`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.0_1687337792559.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v2_generator_ko_5.0.0_3.0_1687337792559.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v2_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_koelectra_base_v2_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|129.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md new file mode 100644 index 00000000000000..d08cb9f7252863 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_base_v3_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_base_v3_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-base-v3-generator` is a Korean model originally trained by `monologg`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.0_1687337798528.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_base_v3_generator_ko_5.0.0_3.0_1687337798528.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_base_v3_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_koelectra_base_v3_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|137.3 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md new file mode 100644 index 00000000000000..e323b62a1bbec4 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_koelectra_small_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from monologg) +author: John Snow Labs +name: electra_embeddings_koelectra_small_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `koelectra-small-generator` is a Korean model originally trained by `monologg`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.0_1687338723919.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_koelectra_small_generator_ko_5.0.0_3.0_1687338723919.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_koelectra_small_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_koelectra_small_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|51.7 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md new file mode 100644 index 00000000000000..0aeb0d295a40c1 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-electra_embeddings_kr_electra_generator_ko.md @@ -0,0 +1,135 @@ +--- +layout: model +title: Korean Electra Embeddings (from snunlp) +author: John Snow Labs +name: electra_embeddings_kr_electra_generator +date: 2023-06-21 +tags: [ko, open_source, electra, embeddings, onnx] +task: Embeddings +language: ko +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Electra Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `KR-ELECTRA-generator` is a Korean model originally trained by `snunlp`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.0_1687338860027.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/electra_embeddings_kr_electra_generator_ko_5.0.0_3.0_1687338860027.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["나는 Spark NLP를 좋아합니다"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val embeddings = BertEmbeddings.pretrained("electra_embeddings_kr_electra_generator","ko") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("나는 Spark NLP를 좋아합니다").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|electra_embeddings_kr_electra_generator| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|ko| +|Size:|124.1 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md b/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md new file mode 100644 index 00000000000000..ed972bedaa7edf --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-legalectra_base_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Legal Electra Word Embeddings Base model +author: John Snow Labs +name: legalectra_base +date: 2023-06-21 +tags: [open_source, legalectra, embeddings, electra, legal, es, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Spanish Legal Word Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legalectra-base-spanish` is a Spanish model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.0_1687336669896.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_base_es_5.0.0_3.0_1687336669896.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +electra = BertEmbeddings.pretrained("legalectra_base","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra]) + +data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val electra = BertEmbeddings.pretrained("legalectra_base","es") + .setInputCols(Array("document", "token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra)) + +val data = Seq("Amo a Spark NLP.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert.base_legal").predict("""Amo a Spark NLP.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +electra = BertEmbeddings.pretrained("legalectra_base","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra]) + +data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val electra = BertEmbeddings.pretrained("legalectra_base","es") + .setInputCols(Array("document", "token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra)) + +val data = Seq("Amo a Spark NLP.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert.base_legal").predict("""Amo a Spark NLP.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|legalectra_base| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|408.5 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md b/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md new file mode 100644 index 00000000000000..a2fb304397a6a0 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-legalectra_small_es.md @@ -0,0 +1,149 @@ +--- +layout: model +title: Spanish Electra Legal Word Embeddings Small model +author: John Snow Labs +name: legalectra_small +date: 2023-06-21 +tags: [open_source, legalectra, embeddings, electra, legal, small, es, onnx] +task: Embeddings +language: es +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Spanish Legal Word Embeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legalectra-small-spanish` is a Spanish model originally trained by `mrm8488`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.0_1687336489949.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/legalectra_small_es_5.0.0_3.0_1687336489949.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +electra = BertEmbeddings.pretrained("legalectra_small","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra]) + +data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val electra = BertEmbeddings.pretrained("legalectra_small","es") + .setInputCols(Array("document", "token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra)) + +val data = Seq("Amo a Spark NLP.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert.small_legal").predict("""Amo a Spark NLP.""") +``` + +
+ +{:.model-param} + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +electra = BertEmbeddings.pretrained("legalectra_small","es") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, electra]) + +data = spark.createDataFrame([["Amo a Spark NLP."]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols(Array("document")) + .setOutputCol("token") + +val electra = BertEmbeddings.pretrained("legalectra_small","es") + .setInputCols(Array("document", "token")) + .setOutputCol("class") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, electra)) + +val data = Seq("Amo a Spark NLP.").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("es.embed.bert.small_legal").predict("""Amo a Spark NLP.""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|legalectra_small| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|es| +|Size:|51.2 MB| +|Case sensitive:|true| \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md new file mode 100644 index 00000000000000..bf7539ce60805d --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-21-ms_bluebert_base_uncased_en.md @@ -0,0 +1,106 @@ +--- +layout: model +title: MS-BERT base model (uncased) +author: John Snow Labs +name: ms_bluebert_base_uncased +date: 2023-06-21 +tags: [embeddings, bert, open_source, en, clinical, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is trained by taking BlueBert as the base model, and training it on a dataset containing approximately 75,000 clinical notes, for about 5000 patients, totaling over 35.7 million words. These notes were collected from patients who visited St. Michael's Hospital MS Clinic between 2015 and 2019. The notes contained a variety of information pertaining to a neurological exam. For example, a note can contain information on the patient's condition, their progress over time and diagnosis. + +BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data) with an automatic process to generate inputs and labels from those texts. 
More precisely, it was pretrained with two objectives: + +Masked language modeling (MLM): taking a sentence, the model randomly masks 15% of the words in the input then runs the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs) that usually see the words one after the other, or from autoregressive models like GPT which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence. +Next sentence prediction (NSP): the models concatenate two masked sentences as inputs during pretraining. Sometimes they correspond to sentences that were next to each other in the original text, sometimes not. The model then has to predict if the two sentences were following each other or not. This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard classifier using the features produced by the BERT model as inputs. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/ms_bluebert_base_uncased_en_5.0.0_3.0_1687372625112.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/ms_bluebert_base_uncased_en_5.0.0_3.0_1687372625112.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") + .setInputCols("sentence", "token") + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` +
+ +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") \ + .setInputCols(["sentence", "token"]) \ + .setOutputCol("embeddings") + +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = BertEmbeddings.pretrained("ms_bluebert_base_uncased", "en") + .setInputCols("sentence", "token") + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` +
+ +## Results + +```bash +Results + + +Generates 768 dimensional embeddings per token + + +{:.model-param} +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|ms_bluebert_base_uncased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[bert]| +|Language:|en| +|Size:|408.2 MB| +|Case sensitive:|false| + +## References + +https://huggingface.co/NLP4H/ms_bert \ No newline at end of file