From 59113cdf3af52632b24414cbde6ec2b377bf7a40 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Sat, 1 Jul 2023 20:06:13 +0700 Subject: [PATCH] 2023-06-28-roberta_base_en (#13871) * Add model 2023-06-28-roberta_base_en * Add model 2023-06-28-roberta_base_opt_en * Add model 2023-06-28-roberta_base_quantized_en * Add model 2023-06-28-small_bert_L2_768_en * Add model 2023-06-28-small_bert_L2_768_opt_en * Add model 2023-06-28-small_bert_L2_768_quantized_en * Add model 2023-06-28-distilbert_base_cased_en * Add model 2023-06-28-distilbert_base_cased_opt_en * Add model 2023-06-28-distilbert_base_cased_quantized_en * Add model 2023-06-28-deberta_v3_base_en * Add model 2023-06-28-deberta_v3_base_opt_en * Add model 2023-06-28-deberta_v3_base_quantized_en * Add model 2023-06-28-distilbert_base_uncased_en * Add model 2023-06-28-distilbert_base_uncased_opt_en * Add model 2023-06-28-distilbert_base_uncased_quantized_en * Add model 2023-06-28-distilbert_base_multilingual_cased_xx * Add model 2023-06-28-distilbert_base_multilingual_cased_xx * Add model 2023-06-28-distilbert_base_multilingual_cased_opt_xx * Add model 2023-06-28-distilbert_base_multilingual_cased_quantized_xx * Add model 2023-06-28-distilbert_embeddings_distilbert_base_german_cased_de * Add model 2023-06-28-distilbert_embeddings_distilbert_base_german_cased_opt_de * Add model 2023-06-28-distilbert_embeddings_distilbert_base_german_cased_quantized_de * Add model 2023-06-29-bert_base_cased_en * Add model 2023-06-29-bert_base_cased_opt_en * Add model 2023-06-29-bert_base_cased_quantized_en --------- Co-authored-by: ahmedlone127 --- .../2023-06-28-deberta_v3_base_en.md | 100 ++++++++++++ .../2023-06-28-deberta_v3_base_opt_en.md | 100 ++++++++++++ ...2023-06-28-deberta_v3_base_quantized_en.md | 100 ++++++++++++ .../2023-06-28-distilbert_base_cased_en.md | 118 ++++++++++++++ ...2023-06-28-distilbert_base_cased_opt_en.md | 118 ++++++++++++++ ...6-28-distilbert_base_cased_quantized_en.md | 118 ++++++++++++++ ...stilbert_base_multilingual_cased_opt_xx.md | 117 ++++++++++++++ ...rt_base_multilingual_cased_quantized_xx.md | 117 ++++++++++++++ ...8-distilbert_base_multilingual_cased_xx.md | 117 ++++++++++++++ .../2023-06-28-distilbert_base_uncased_en.md | 118 ++++++++++++++ ...23-06-28-distilbert_base_uncased_opt_en.md | 118 ++++++++++++++ ...28-distilbert_base_uncased_quantized_en.md | 118 ++++++++++++++ ...eddings_distilbert_base_german_cased_de.md | 149 ++++++++++++++++++ ...ngs_distilbert_base_german_cased_opt_de.md | 149 ++++++++++++++++++ ...stilbert_base_german_cased_quantized_de.md | 149 ++++++++++++++++++ .../2023-06-28-roberta_base_en.md | 122 ++++++++++++++ .../2023-06-28-roberta_base_opt_en.md | 122 ++++++++++++++ .../2023-06-28-roberta_base_quantized_en.md | 122 ++++++++++++++ .../2023-06-28-small_bert_L2_768_en.md | 136 ++++++++++++++++ .../2023-06-28-small_bert_L2_768_opt_en.md | 136 ++++++++++++++++ ...23-06-28-small_bert_L2_768_quantized_en.md | 136 ++++++++++++++++ .../2023-06-29-bert_base_cased_en.md | 137 ++++++++++++++++ .../2023-06-29-bert_base_cased_opt_en.md | 137 ++++++++++++++++ ...2023-06-29-bert_base_cased_quantized_en.md | 137 ++++++++++++++++ 24 files changed, 2991 insertions(+) create mode 100644 docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_en.md create mode 100644 docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_opt_en.md create mode 100644 docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_quantized_en.md create mode 100644 
docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_opt_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_quantized_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_opt_xx.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_quantized_xx.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_xx.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_opt_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_quantized_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_de.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_opt_de.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_quantized_de.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-roberta_base_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-roberta_base_opt_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-roberta_base_quantized_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_opt_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_quantized_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_opt_en.md
 create mode 100644 docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_quantized_en.md

diff --git a/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_en.md b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_en.md
new file mode 100644
index 00000000000000..e67736379a0df8
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_en.md
@@ -0,0 +1,100 @@
+---
+layout: model
+title: DeBERTa base model
+author: John Snow Labs
+name: deberta_v3_base
+date: 2023-06-28
+tags: [en, english, open_source, embeddings, deberta, v3, base, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DeBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao and Weizhu Chen. It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and on RACE by +3.6% (83.2% vs. 86.8%).
+ +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_base_en_5.0.0_3.0_1687957496351.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_base_en_5.0.0_3.0_1687957496351.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.deberta_v3_base").predict("""Put your text here.""") +``` + +
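+The snippet above assumes that upstream annotators have already produced the `sentence` and `token` columns. A minimal end-to-end sketch of such a pipeline, built from standard Spark NLP stages (the variable names and sample text are illustrative, not part of this card):
+
+```python
+from pyspark.ml import Pipeline
+from sparknlp.base import DocumentAssembler
+from sparknlp.annotator import SentenceDetector, Tokenizer, DeBertaEmbeddings
+
+# Upstream stages producing the "sentence" and "token" columns
+document_assembler = DocumentAssembler() \
+.setInputCol("text") \
+.setOutputCol("document")
+
+sentence_detector = SentenceDetector() \
+.setInputCols(["document"]) \
+.setOutputCol("sentence")
+
+tokenizer = Tokenizer() \
+.setInputCols(["sentence"]) \
+.setOutputCol("token")
+
+embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base", "en") \
+.setInputCols(["sentence", "token"]) \
+.setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+
+# `spark` is an active session, e.g. created by sparknlp.start()
+data = spark.createDataFrame([["Put your text here."]]).toDF("text")
+result = pipeline.fit(data).transform(data)
+```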
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|deberta_v3_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|435.2 MB|
+|Case sensitive:|true|
+|Max sentence length:|128|
+
+## Benchmarking
+
+```bash
+Benchmarking
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_opt_en.md
new file mode 100644
index 00000000000000..68d625e906f51c
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_opt_en.md
@@ -0,0 +1,100 @@
+---
+layout: model
+title: DeBERTa base model
+author: John Snow Labs
+name: deberta_v3_base_opt
+date: 2023-06-28
+tags: [en, english, open_source, embeddings, deberta, v3, base, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DeBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao and Weizhu Chen. It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and on RACE by +3.6% (83.2% vs. 86.8%).
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_base_opt_en_5.0.0_3.0_1687958380723.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_base_opt_en_5.0.0_3.0_1687958380723.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+# Loads the optimized export documented by this card
+embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+```
+```scala
+val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.deberta_v3_base").predict("""Put your text here.""")
+```
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|deberta_v3_base_opt|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|469.3 MB|
+|Case sensitive:|true|
+|Max sentence length:|128|
+
+## Benchmarking
+
+```bash
+Benchmarking
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_quantized_en.md
new file mode 100644
index 00000000000000..6457327a744a22
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-deberta_v3_base_quantized_en.md
@@ -0,0 +1,100 @@
+---
+layout: model
+title: DeBERTa base model
+author: John Snow Labs
+name: deberta_v3_base_quantized
+date: 2023-06-28
+tags: [en, english, open_source, embeddings, deberta, v3, base, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DeBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao and Weizhu Chen. It is based on Google’s BERT model released in 2018 and Facebook’s RoBERTa model released in 2019. Compared to RoBERTa-Large, a DeBERTa model trained on half of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and on RACE by +3.6% (83.2% vs. 86.8%).
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/deberta_v3_base_quantized_en_5.0.0_3.0_1687958846162.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/deberta_v3_base_quantized_en_5.0.0_3.0_1687958846162.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+# Loads the quantized export documented by this card
+embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+```
+```scala
+val embeddings = DeBertaEmbeddings.pretrained("deberta_v3_base_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.deberta_v3_base").predict("""Put your text here.""")
+```
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|deberta_v3_base_quantized| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|310.7 MB| +|Case sensitive:|true| +|Max sentence length:|128| + +## Benchmarking + +```bash +Benchmarking +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_en.md new file mode 100644 index 00000000000000..2a6be63cb79d9b --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_en.md @@ -0,0 +1,118 @@ +--- +layout: model +title: DistilBERT base model (cased) +author: John Snow Labs +name: distilbert_base_cased +date: 2023-06-28 +tags: [distilbert, en, english, open_source, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-cased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_en_5.0.0_3.0_1687955596708.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_en_5.0.0_3.0_1687955596708.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert").predict("""Put your text here.""") +``` + +
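+The `document_assembler`, `sentence_detector` and `tokenizer` stages referenced in the pipeline line above are not defined in the snippet. A minimal sketch of the assumed definitions (standard Spark NLP stages; the names simply match the snippet):
+
+```python
+from sparknlp.base import DocumentAssembler
+from sparknlp.annotator import SentenceDetector, Tokenizer
+
+# Turns the raw "text" column into a "document" annotation column
+document_assembler = DocumentAssembler() \
+.setInputCol("text") \
+.setOutputCol("document")
+
+# Splits documents into the "sentence" column expected by the embeddings
+sentence_detector = SentenceDetector() \
+.setInputCols(["document"]) \
+.setOutputCol("sentence")
+
+# Splits sentences into the "token" column expected by the embeddings
+tokenizer = Tokenizer() \
+.setInputCols(["sentence"]) \
+.setOutputCol("token")
+```
+
+The same definitions apply to the other DistilBERT cards in this commit, which reference these stages without defining them.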
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_cased| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|243.8 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-cased](https://huggingface.co/distilbert-base-cased) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 81.5 | 87.8 | 88.2 | 90.4 | 47.2 | 85.5 | 85.6 | 60.6 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_opt_en.md new file mode 100644 index 00000000000000..fb694045a0f578 --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_opt_en.md @@ -0,0 +1,118 @@ +--- +layout: model +title: DistilBERT base model (cased) +author: John Snow Labs +name: distilbert_base_cased_opt +date: 2023-06-28 +tags: [distilbert, en, english, open_source, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-cased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_opt_en_5.0.0_3.0_1687955659414.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_opt_en_5.0.0_3.0_1687955659414.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+# document_assembler, sentence_detector and tokenizer are the standard
+# upstream stages sketched in the distilbert_base_cased card above
+embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+```
+```scala
+val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.distilbert").predict("""Put your text here.""")
+```
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilbert_base_cased_opt| +|Compatibility:|Spark NLP 5.0.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[token, sentence]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|243.8 MB| +|Case sensitive:|true| + +## References + +[https://huggingface.co/distilbert-base-cased](https://huggingface.co/distilbert-base-cased) + +## Benchmarking + +```bash +Benchmarking + + +When fine-tuned on downstream tasks, this model achieves the following results: + +Glue test results: + +| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE | +|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:| +| | 81.5 | 87.8 | 88.2 | 90.4 | 47.2 | 85.5 | 85.6 | 60.6 | + + +``` \ No newline at end of file diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_quantized_en.md new file mode 100644 index 00000000000000..8ced215668327e --- /dev/null +++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_cased_quantized_en.md @@ -0,0 +1,118 @@ +--- +layout: model +title: DistilBERT base model (cased) +author: John Snow Labs +name: distilbert_base_cased_quantized +date: 2023-06-28 +tags: [distilbert, en, english, open_source, embeddings, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.0.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: DistilBertEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-cased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_quantized_en_5.0.0_3.0_1687955697660.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_cased_quantized_en_5.0.0_3.0_1687955697660.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + +
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+# document_assembler, sentence_detector and tokenizer are the standard
+# upstream stages sketched in the distilbert_base_cased card above
+embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+```
+```scala
+val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_cased_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.distilbert").predict("""Put your text here.""")
+```
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_base_cased_quantized|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|111.1 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/distilbert-base-cased](https://huggingface.co/distilbert-base-cased)
+
+## Benchmarking
+
+```bash
+Benchmarking
+
+
+When fine-tuned on downstream tasks, this model achieves the following results:
+
+Glue test results:
+
+| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE |
+|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|
+|      | 81.5 | 87.8 | 88.2 | 90.4  | 47.2 | 85.5  | 85.6 | 60.6 |
+
+
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_opt_xx.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_opt_xx.md
new file mode 100644
index 00000000000000..f7e2051a4eae33
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_opt_xx.md
@@ -0,0 +1,117 @@
+---
+layout: model
+title: DistilBERT base multilingual model (cased)
+author: John Snow Labs
+name: distilbert_base_multilingual_cased_opt
+date: 2023-06-28
+tags: [distilbert, embeddings, xx, multilingual, open_source, onnx]
+task: Embeddings
+language: xx
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DistilBertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This model is a distilled version of the [BERT base multilingual model](https://huggingface.co/bert-base-multilingual-cased). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. The model is trained on the concatenation of Wikipedia in 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
+
+The model has 6 layers, 768 dimensions and 12 heads, totaling 134M parameters (compared to 177M parameters for mBERT-base). On average, DistilmBERT is twice as fast as mBERT-base.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_opt_xx_5.0.0_3.0_1687985561229.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_opt_xx_5.0.0_3.0_1687985561229.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+# document_assembler, sentence_detector and tokenizer are the standard
+# upstream stages sketched earlier in this commit
+embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased_opt", "xx") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+```
+```scala
+val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased_opt", "xx")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("xx.embed.distilbert").predict("""Put your text here.""")
+```
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_base_multilingual_cased_opt|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|xx|
+|Size:|505.4 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/distilbert-base-multilingual-cased](https://huggingface.co/distilbert-base-multilingual-cased)
+
+## Benchmarking
+
+```bash
+Benchmarking
+
+
+| Model                        | English | Spanish | Chinese | German | Arabic | Urdu |
+| :---:                        | :---:   | :---:   | :---:   | :---:  | :---:  | :---:|
+| mBERT base cased (computed)  | 82.1    | 74.6    | 69.1    | 72.3   | 66.4   | 58.5 |
+| mBERT base uncased (reported)| 81.4    | 74.3    | 63.8    | 70.5   | 62.1   | 58.3 |
+| DistilmBERT                  | 78.2    | 69.1    | 64.0    | 66.3   | 59.1   | 54.7 |
+
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_quantized_xx.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_quantized_xx.md
new file mode 100644
index 00000000000000..909e419bb06c8c
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_quantized_xx.md
@@ -0,0 +1,117 @@
+---
+layout: model
+title: DistilBERT base multilingual model (cased)
+author: John Snow Labs
+name: distilbert_base_multilingual_cased_quantized
+date: 2023-06-28
+tags: [distilbert, embeddings, xx, multilingual, open_source, onnx]
+task: Embeddings
+language: xx
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DistilBertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This model is a distilled version of the [BERT base multilingual model](https://huggingface.co/bert-base-multilingual-cased). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. The model is trained on the concatenation of Wikipedia in 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
+
+The model has 6 layers, 768 dimensions and 12 heads, totaling 134M parameters (compared to 177M parameters for mBERT-base). On average, DistilmBERT is twice as fast as mBERT-base.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_quantized_xx_5.0.0_3.0_1687985753267.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_quantized_xx_5.0.0_3.0_1687985753267.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+# document_assembler, sentence_detector and tokenizer are the standard
+# upstream stages sketched earlier in this commit
+embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased_quantized", "xx") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+```
+```scala
+val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased_quantized", "xx")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("xx.embed.distilbert").predict("""Put your text here.""")
+```
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_base_multilingual_cased_quantized|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|xx|
+|Size:|371.7 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/distilbert-base-multilingual-cased](https://huggingface.co/distilbert-base-multilingual-cased)
+
+## Benchmarking
+
+```bash
+Benchmarking
+
+
+| Model                        | English | Spanish | Chinese | German | Arabic | Urdu |
+| :---:                        | :---:   | :---:   | :---:   | :---:  | :---:  | :---:|
+| mBERT base cased (computed)  | 82.1    | 74.6    | 69.1    | 72.3   | 66.4   | 58.5 |
+| mBERT base uncased (reported)| 81.4    | 74.3    | 63.8    | 70.5   | 62.1   | 58.3 |
+| DistilmBERT                  | 78.2    | 69.1    | 64.0    | 66.3   | 59.1   | 54.7 |
+
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_xx.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_xx.md
new file mode 100644
index 00000000000000..24e2b11a218e68
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_multilingual_cased_xx.md
@@ -0,0 +1,117 @@
+---
+layout: model
+title: DistilBERT base multilingual model (cased)
+author: John Snow Labs
+name: distilbert_base_multilingual_cased
+date: 2023-06-28
+tags: [distilbert, embeddings, xx, multilingual, open_source, onnx]
+task: Embeddings
+language: xx
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DistilBertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This model is a distilled version of the [BERT base multilingual model](https://huggingface.co/bert-base-multilingual-cased). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is cased: it does make a difference between english and English. The model is trained on the concatenation of Wikipedia in 104 different languages listed [here](https://github.com/google-research/bert/blob/master/multilingual.md#list-of-languages).
+
+The model has 6 layers, 768 dimensions and 12 heads, totaling 134M parameters (compared to 177M parameters for mBERT-base). On average, DistilmBERT is twice as fast as mBERT-base.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_xx_5.0.0_3.0_1687985402696.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_multilingual_cased_xx_5.0.0_3.0_1687985402696.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("xx.embed.distilbert").predict("""Put your text here.""") +``` + +
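+Since this is a single multilingual checkpoint, one fitted pipeline can embed text in any of the 104 training languages. A self-contained usage sketch (the stage names and sample sentences are illustrative, not part of the card):
+
+```python
+from pyspark.ml import Pipeline
+from sparknlp.base import DocumentAssembler
+from sparknlp.annotator import SentenceDetector, Tokenizer, DistilBertEmbeddings
+
+# Standard upstream stages producing "sentence" and "token" columns
+document_assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
+sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
+tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")
+
+embeddings = DistilBertEmbeddings.pretrained("distilbert_base_multilingual_cased", "xx") \
+.setInputCols(["sentence", "token"]) \
+.setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+
+# One multilingual checkpoint covers all of these languages
+data = spark.createDataFrame([
+["This is an English sentence."],
+["Dies ist ein deutscher Satz."],
+["Esta es una frase en español."]
+]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+result.select("embeddings.result").show(truncate=60)
+```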
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_base_multilingual_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|xx|
+|Size:|505.4 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/distilbert-base-multilingual-cased](https://huggingface.co/distilbert-base-multilingual-cased)
+
+## Benchmarking
+
+```bash
+Benchmarking
+
+
+| Model                        | English | Spanish | Chinese | German | Arabic | Urdu |
+| :---:                        | :---:   | :---:   | :---:   | :---:  | :---:  | :---:|
+| mBERT base cased (computed)  | 82.1    | 74.6    | 69.1    | 72.3   | 66.4   | 58.5 |
+| mBERT base uncased (reported)| 81.4    | 74.3    | 63.8    | 70.5   | 62.1   | 58.3 |
+| DistilmBERT                  | 78.2    | 69.1    | 64.0    | 66.3   | 59.1   | 54.7 |
+
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_en.md
new file mode 100644
index 00000000000000..ee169a452abca5
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_en.md
@@ -0,0 +1,118 @@
+---
+layout: model
+title: DistilBERT base model (uncased)
+author: John Snow Labs
+name: distilbert_base_uncased
+date: 2023-06-28
+tags: [distilbert, en, english, embeddings, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DistilBertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-uncased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is uncased: it does not make a difference between english and English.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_en_5.0.0_3.0_1687984618202.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_en_5.0.0_3.0_1687984618202.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +``` +```scala +val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.distilbert.base.uncased").predict("""Put your text here.""") +``` + +
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_base_uncased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|247.2 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased)
+
+## Benchmarking
+
+```bash
+Benchmarking
+
+
+When fine-tuned on downstream tasks, this model achieves the following results:
+
+Glue test results:
+
+| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE |
+|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|
+|      | 82.2 | 88.5 | 89.2 | 91.3  | 51.3 | 85.8  | 87.5 | 59.9 |
+
+
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_opt_en.md
new file mode 100644
index 00000000000000..61491f8bda0547
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_opt_en.md
@@ -0,0 +1,118 @@
+---
+layout: model
+title: DistilBERT base model (uncased)
+author: John Snow Labs
+name: distilbert_base_uncased_opt
+date: 2023-06-28
+tags: [distilbert, en, english, embeddings, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DistilBertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-uncased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is uncased: it does not make a difference between english and English.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_opt_en_5.0.0_3.0_1687984673095.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_opt_en_5.0.0_3.0_1687984673095.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+# document_assembler, sentence_detector and tokenizer are the standard
+# upstream stages sketched earlier in this commit
+embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+```
+```scala
+val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.distilbert.base.uncased").predict("""Put your text here.""")
+```
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_base_uncased_opt|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|247.2 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased)
+
+## Benchmarking
+
+```bash
+Benchmarking
+
+
+When fine-tuned on downstream tasks, this model achieves the following results:
+
+Glue test results:
+
+| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE |
+|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|
+|      | 82.2 | 88.5 | 89.2 | 91.3  | 51.3 | 85.8  | 87.5 | 59.9 |
+
+
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_quantized_en.md
new file mode 100644
index 00000000000000..b5608cd91ac12f
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_base_uncased_quantized_en.md
@@ -0,0 +1,118 @@
+---
+layout: model
+title: DistilBERT base model (uncased)
+author: John Snow Labs
+name: distilbert_base_uncased_quantized
+date: 2023-06-28
+tags: [distilbert, en, english, embeddings, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DistilBertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This model is a distilled version of the [BERT base model](https://huggingface.co/bert-base-uncased). It was introduced in [this paper](https://arxiv.org/abs/1910.01108). The code for the distillation process can be found [here](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation). This model is uncased: it does not make a difference between english and English.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_quantized_en_5.0.0_3.0_1687984715653.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_base_uncased_quantized_en_5.0.0_3.0_1687984715653.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+# document_assembler, sentence_detector and tokenizer are the standard
+# upstream stages sketched earlier in this commit
+embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+```
+```scala
+val embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.distilbert.base.uncased").predict("""Put your text here.""")
+```
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_base_uncased_quantized|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|114.3 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased)
+
+## Benchmarking
+
+```bash
+Benchmarking
+
+
+When fine-tuned on downstream tasks, this model achieves the following results:
+
+Glue test results:
+
+| Task | MNLI | QQP | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE |
+|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|
+|      | 82.2 | 88.5 | 89.2 | 91.3  | 51.3 | 85.8  | 87.5 | 59.9 |
+
+
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_de.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_de.md
new file mode 100644
index 00000000000000..5ded15b165dcb2
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_de.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: German DistilBERT Embeddings
+author: John Snow Labs
+name: distilbert_embeddings_distilbert_base_german_cased
+date: 2023-06-28
+tags: [distilbert, embeddings, de, open_source, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DistilBertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-german-cased` is a German model originally trained by HuggingFace.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_de_5.0.0_3.0_1687986069961.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_de_5.0.0_3.0_1687986069961.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+documentAssembler = DocumentAssembler() \
+.setInputCol("text") \
+.setOutputCol("document")
+
+tokenizer = Tokenizer() \
+.setInputCols("document") \
+.setOutputCol("token")
+
+embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de") \
+.setInputCols(["document", "token"]) \
+.setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+val documentAssembler = new DocumentAssembler()
+.setInputCol("text")
+.setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+.setInputCols(Array("document"))
+.setOutputCol("token")
+
+val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased","de")
+.setInputCols(Array("document", "token"))
+.setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("Ich liebe Spark NLP").toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("de.embed.distilbert_base_german_cased").predict("""Ich liebe Spark NLP""")
+```
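+One way to inspect the output of the pipeline above is to explode the annotation column into one row per token; each annotation carries the token text in its `result` field and its 768-dimensional vector in `embeddings`:
+
+```python
+# One row per token: the token text and its embedding vector
+result.selectExpr("explode(embeddings) as annotation") \
+.selectExpr("annotation.result as token", "annotation.embeddings as vector") \
+.show(truncate=80)
+```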
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_embeddings_distilbert_base_german_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|de|
+|Size:|250.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_opt_de.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_opt_de.md
new file mode 100644
index 00000000000000..aa7520c32d32f5
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_opt_de.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: German DistilBERT Embeddings
+author: John Snow Labs
+name: distilbert_embeddings_distilbert_base_german_cased_opt
+date: 2023-06-28
+tags: [distilbert, embeddings, de, open_source, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DistilBertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-german-cased` is a German model originally trained by HuggingFace.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_opt_de_5.0.0_3.0_1687986127648.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_opt_de_5.0.0_3.0_1687986127648.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+from sparknlp.base import DocumentAssembler
+from sparknlp.annotator import Tokenizer, DistilBertEmbeddings
+from pyspark.ml import Pipeline
+
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased_opt","de") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+import com.johnsnowlabs.nlp.DocumentAssembler
+import com.johnsnowlabs.nlp.annotator.{Tokenizer, DistilBertEmbeddings}
+import org.apache.spark.ml.Pipeline
+import spark.implicits._
+
+val documentAssembler = new DocumentAssembler()
+  .setInputCol("text")
+  .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+  .setInputCols(Array("document"))
+  .setOutputCol("token")
+
+val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased_opt","de")
+  .setInputCols(Array("document", "token"))
+  .setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("Ich liebe Spark NLP").toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("de.embed.distilbert_base_german_cased").predict("""Ich liebe Spark NLP""")
+```
+
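+
+Since the `_opt` export targets faster inference, a `LightPipeline` is a natural fit for low-latency, single-document use. A minimal sketch, reusing the `pipeline` and `data` from the example above:
+
+```python
+from sparknlp.base import LightPipeline
+
+light = LightPipeline(pipeline.fit(data))
+# fullAnnotate returns annotation objects with the embedding vectors attached
+annotations = light.fullAnnotate("Ich liebe Spark NLP")[0]
+print(annotations["embeddings"][0].embeddings[:3])
+```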
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_embeddings_distilbert_base_german_cased_opt|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|de|
+|Size:|250.3 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_quantized_de.md b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_quantized_de.md
new file mode 100644
index 00000000000000..57b94449d930ed
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-distilbert_embeddings_distilbert_base_german_cased_quantized_de.md
@@ -0,0 +1,149 @@
+---
+layout: model
+title: German DistilBERT Embeddings
+author: John Snow Labs
+name: distilbert_embeddings_distilbert_base_german_cased_quantized
+date: 2023-06-28
+tags: [distilbert, embeddings, de, open_source, onnx]
+task: Embeddings
+language: de
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: DistilBertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained DistilBERT Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `distilbert-base-german-cased` is a German model originally trained by HuggingFace.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_quantized_de_5.0.0_3.0_1687986166061.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilbert_embeddings_distilbert_base_german_cased_quantized_de_5.0.0_3.0_1687986166061.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+from sparknlp.base import DocumentAssembler
+from sparknlp.annotator import Tokenizer, DistilBertEmbeddings
+from pyspark.ml import Pipeline
+
+documentAssembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+tokenizer = Tokenizer() \
+    .setInputCols("document") \
+    .setOutputCol("token")
+
+embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased_quantized","de") \
+    .setInputCols(["document", "token"]) \
+    .setOutputCol("embeddings")
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])
+
+data = spark.createDataFrame([["Ich liebe Spark NLP"]]).toDF("text")
+
+result = pipeline.fit(data).transform(data)
+```
+```scala
+import com.johnsnowlabs.nlp.DocumentAssembler
+import com.johnsnowlabs.nlp.annotator.{Tokenizer, DistilBertEmbeddings}
+import org.apache.spark.ml.Pipeline
+import spark.implicits._
+
+val documentAssembler = new DocumentAssembler()
+  .setInputCol("text")
+  .setOutputCol("document")
+
+val tokenizer = new Tokenizer()
+  .setInputCols(Array("document"))
+  .setOutputCol("token")
+
+val embeddings = DistilBertEmbeddings.pretrained("distilbert_embeddings_distilbert_base_german_cased_quantized","de")
+  .setInputCols(Array("document", "token"))
+  .setOutputCol("embeddings")
+
+val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings))
+
+val data = Seq("Ich liebe Spark NLP").toDF("text")
+
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("de.embed.distilbert_base_german_cased").predict("""Ich liebe Spark NLP""")
+```
+
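+
+If you want to feed these embeddings into Spark ML estimators, `EmbeddingsFinisher` converts the annotation structs into plain vectors. A sketch reusing the stages defined in the example above:
+
+```python
+from sparknlp.base import EmbeddingsFinisher
+from pyspark.ml import Pipeline
+
+finisher = EmbeddingsFinisher() \
+    .setInputCols("embeddings") \
+    .setOutputCols("finished_embeddings") \
+    .setOutputAsVector(True)
+
+pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings, finisher])
+result = pipeline.fit(data).transform(data)
+
+# one Spark ML vector per token
+result.selectExpr("explode(finished_embeddings) as vector").show(3)
+```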
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|distilbert_embeddings_distilbert_base_german_cased_quantized|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|de|
+|Size:|115.9 MB|
+|Case sensitive:|true|
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-roberta_base_en.md b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_en.md
new file mode 100644
index 00000000000000..9ef8061b84bf7c
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_en.md
@@ -0,0 +1,122 @@
+---
+layout: model
+title: RoBERTa base model
+author: John Snow Labs
+name: roberta_base
+date: 2023-06-28
+tags: [en, english, roberta, embeddings, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in [this paper](https://arxiv.org/abs/1907.11692) and first released in [this repository](https://github.com/pytorch/fairseq/tree/master/examples/roberta). This model is case-sensitive: it makes a difference between english and English.
+
+RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data), using an automatic process to generate inputs and labels from those texts.
+
+More precisely, it was pretrained with the masked language modeling (MLM) objective. Taking a sentence, the model randomly masks 15% of the words in the input, then runs the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs), which usually see the words one after the other, and from autoregressive models like GPT, which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence.
+
+This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard classifier using the features produced by the RoBERTa model as inputs.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_base_en_5.0.0_3.0_1687951314039.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_base_en_5.0.0_3.0_1687951314039.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +embeddings = RoBertaEmbeddings.pretrained("roberta_base", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +``` +```scala +val embeddings = RoBertaEmbeddings.pretrained("roberta_base", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +``` + + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.roberta").predict("""Put your text here.""") +``` + +
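+
+The Python and Scala snippets above reference `document_assembler`, `sentence_detector` and `tokenizer` without defining them. A minimal sketch of those upstream stages (Python; column names chosen to match the embeddings stage above):
+
+```python
+from sparknlp.base import DocumentAssembler
+from sparknlp.annotator import SentenceDetector, Tokenizer
+
+document_assembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+sentence_detector = SentenceDetector() \
+    .setInputCols(["document"]) \
+    .setOutputCol("sentence")
+
+tokenizer = Tokenizer() \
+    .setInputCols(["sentence"]) \
+    .setOutputCol("token")
+```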
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_base|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|298.2 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/roberta-base](https://huggingface.co/roberta-base)
+
+## Benchmarking
+
+```bash
+When fine-tuned on downstream tasks, this model achieves the following results:
+
+Glue test results:
+
+| Task | MNLI | QQP  | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE  |
+|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|
+|      | 87.6 | 91.9 | 92.8 | 94.8  | 63.6 | 91.2  | 90.2 | 78.7 |
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-roberta_base_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_opt_en.md
new file mode 100644
index 00000000000000..53d45bb7101929
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_opt_en.md
@@ -0,0 +1,122 @@
+---
+layout: model
+title: RoBERTa base model
+author: John Snow Labs
+name: roberta_base_opt
+date: 2023-06-28
+tags: [en, english, roberta, embeddings, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in [this paper](https://arxiv.org/abs/1907.11692) and first released in [this repository](https://github.com/pytorch/fairseq/tree/master/examples/roberta). This model is case-sensitive: it makes a difference between english and English.
+
+RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data), using an automatic process to generate inputs and labels from those texts.
+
+More precisely, it was pretrained with the masked language modeling (MLM) objective. Taking a sentence, the model randomly masks 15% of the words in the input, then runs the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs), which usually see the words one after the other, and from autoregressive models like GPT, which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence.
+
+This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard classifier using the features produced by the RoBERTa model as inputs.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_base_opt_en_5.0.0_3.0_1687951622206.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_base_opt_en_5.0.0_3.0_1687951622206.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = RoBertaEmbeddings.pretrained("roberta_base_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+```
+```scala
+val embeddings = RoBertaEmbeddings.pretrained("roberta_base_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.roberta").predict("""Put your text here.""")
+```
+
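+
+To check what the optimized export buys you on your own hardware, a rough timing harness can help. This is a sketch only: `pre_tokenized_df` is a hypothetical DataFrame that already contains the `sentence` and `token` columns produced by the upstream stages, and any speedup will vary by machine.
+
+```python
+import time
+
+from sparknlp.annotator import RoBertaEmbeddings
+
+# Hypothetical comparison of the base and optimized exports on the same data.
+for name in ["roberta_base", "roberta_base_opt"]:
+    emb = RoBertaEmbeddings.pretrained(name, "en") \
+        .setInputCols("sentence", "token") \
+        .setOutputCol("embeddings")
+    start = time.time()
+    emb.transform(pre_tokenized_df).collect()  # force full evaluation
+    print(name, f"{time.time() - start:.2f}s")
+```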
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_base_opt|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|298.3 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/roberta-base](https://huggingface.co/roberta-base)
+
+## Benchmarking
+
+```bash
+When fine-tuned on downstream tasks, this model achieves the following results:
+
+Glue test results:
+
+| Task | MNLI | QQP  | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE  |
+|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|
+|      | 87.6 | 91.9 | 92.8 | 94.8  | 63.6 | 91.2  | 90.2 | 78.7 |
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-roberta_base_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_quantized_en.md
new file mode 100644
index 00000000000000..2459015ce7c014
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-roberta_base_quantized_en.md
@@ -0,0 +1,122 @@
+---
+layout: model
+title: RoBERTa base model
+author: John Snow Labs
+name: roberta_base_quantized
+date: 2023-06-28
+tags: [en, english, roberta, embeddings, open_source, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: RoBertaEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in [this paper](https://arxiv.org/abs/1907.11692) and first released in [this repository](https://github.com/pytorch/fairseq/tree/master/examples/roberta). This model is case-sensitive: it makes a difference between english and English.
+
+RoBERTa is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it was pretrained on the raw texts only, with no humans labeling them in any way (which is why it can use lots of publicly available data), using an automatic process to generate inputs and labels from those texts.
+
+More precisely, it was pretrained with the masked language modeling (MLM) objective. Taking a sentence, the model randomly masks 15% of the words in the input, then runs the entire masked sentence through the model and has to predict the masked words. This is different from traditional recurrent neural networks (RNNs), which usually see the words one after the other, and from autoregressive models like GPT, which internally mask the future tokens. It allows the model to learn a bidirectional representation of the sentence.
+
+This way, the model learns an inner representation of the English language that can then be used to extract features useful for downstream tasks: if you have a dataset of labeled sentences, for instance, you can train a standard classifier using the features produced by the RoBERTa model as inputs.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_base_quantized_en_5.0.0_3.0_1687951753623.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_base_quantized_en_5.0.0_3.0_1687951753623.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+```python
+embeddings = RoBertaEmbeddings.pretrained("roberta_base_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+```
+```scala
+val embeddings = RoBertaEmbeddings.pretrained("roberta_base_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+```
+
+
+{:.nlu-block}
+```python
+import nlu
+nlu.load("en.embed.roberta").predict("""Put your text here.""")
+```
+
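+
+Quantization trades a little numeric fidelity for a smaller model. One hedged way to gauge the drift is to compare per-token vectors from the base and quantized pipelines; `base_vecs` and `quant_vecs` below are hypothetical lists of vectors collected from the two runs (e.g. via the inspection snippet shown for the German DistilBERT card above).
+
+```python
+import numpy as np
+
+def cosine(a, b):
+    a, b = np.asarray(a), np.asarray(b)
+    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+# per-token cosine similarity between the two exports; values near 1.0
+# indicate the quantized vectors track the base model closely
+similarities = [cosine(a, b) for a, b in zip(base_vecs, quant_vecs)]
+print("mean cosine similarity:", float(np.mean(similarities)))
+```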
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|roberta_base_quantized|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[token, sentence]|
+|Output Labels:|[embeddings]|
+|Language:|en|
+|Size:|153.6 MB|
+|Case sensitive:|true|
+
+## References
+
+[https://huggingface.co/roberta-base](https://huggingface.co/roberta-base)
+
+## Benchmarking
+
+```bash
+When fine-tuned on downstream tasks, this model achieves the following results:
+
+Glue test results:
+
+| Task | MNLI | QQP  | QNLI | SST-2 | CoLA | STS-B | MRPC | RTE  |
+|:----:|:----:|:----:|:----:|:-----:|:----:|:-----:|:----:|:----:|
+|      | 87.6 | 91.9 | 92.8 | 94.8  | 63.6 | 91.2  | 90.2 | 78.7 |
+```
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_en.md b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_en.md
new file mode 100644
index 00000000000000..81ffc14f7063a1
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_en.md
@@ -0,0 +1,136 @@
+---
+layout: model
+title: Smaller BERT Embeddings (L-2_H-768_A-12)
+author: John Snow Labs
+name: small_bert_L2_768
+date: 2023-06-28
+tags: [open_source, embeddings, en, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This is one of the smaller BERT models referenced in [Well-Read Students Learn Better: On the Importance of Pre-training Compact Models](https://arxiv.org/abs/1908.08962). The smaller BERT models are intended for environments with restricted computational resources. They can be fine-tuned in the same manner as the original BERT models. However, they are most effective in the context of knowledge distillation, where the fine-tuning labels are produced by a larger and more accurate teacher.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_en_5.0.0_3.0_1687953630830.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_en_5.0.0_3.0_1687953630830.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+ +{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +... +embeddings = BertEmbeddings.pretrained("small_bert_L2_768", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) +result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"])) +``` + +```scala +... +val embeddings = BertEmbeddings.pretrained("small_bert_L2_768", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +val data = Seq("I love NLP").toDF("text") +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu + +text = ["I love NLP"] +embeddings_df = nlu.load('en.embed.bert.small_L2_768').predict(text, output_level='token') +embeddings_df +``` + +
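+
+Token vectors are often pooled into one vector per sentence before training a downstream classifier. A minimal sketch using Spark NLP's `SentenceEmbeddings` annotator, with the stage and column names from the example above:
+
+```python
+from sparknlp.annotator import SentenceEmbeddings
+
+# average the 768-d token vectors into a single vector per sentence
+sentence_embeddings = SentenceEmbeddings() \
+    .setInputCols(["sentence", "embeddings"]) \
+    .setOutputCol("sentence_embeddings") \
+    .setPoolingStrategy("AVERAGE")
+```
+
+The pooled column can then be appended as an extra pipeline stage after `embeddings`.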
+
+## Results
+
+```bash
+ token   en_embed_bert_small_L2_768_embeddings
+
+ I       [-0.2451338768005371, 0.40763044357299805, -0....
+ love    [-0.23793038725852966, -0.07403656840324402, -...
+ NLP     [-0.864113450050354, -0.2902209758758545, 0.54...
+```
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|small_bert_L2_768|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|141.9 MB|
+|Case sensitive:|false|
+
+## References
+
+The model is imported from [https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1](https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1)
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_opt_en.md b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_opt_en.md
new file mode 100644
index 00000000000000..ebe871e2fc8481
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_opt_en.md
@@ -0,0 +1,136 @@
+---
+layout: model
+title: Smaller BERT Embeddings (L-2_H-768_A-12)
+author: John Snow Labs
+name: small_bert_L2_768_opt
+date: 2023-06-28
+tags: [open_source, embeddings, en, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This is one of the smaller BERT models referenced in [Well-Read Students Learn Better: On the Importance of Pre-training Compact Models](https://arxiv.org/abs/1908.08962). The smaller BERT models are intended for environments with restricted computational resources. They can be fine-tuned in the same manner as the original BERT models. However, they are most effective in the context of knowledge distillation, where the fine-tuning labels are produced by a larger and more accurate teacher.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_opt_en_5.0.0_3.0_1687953671808.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_opt_en_5.0.0_3.0_1687953671808.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+...
+embeddings = BertEmbeddings.pretrained("small_bert_L2_768_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("small_bert_L2_768_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.small_L2_768').predict(text, output_level='token')
+embeddings_df
+```
+
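+
+For batch scoring with the optimized export, two knobs worth tuning are the batch size and the maximum sentence length. Both setters exist on `BertEmbeddings`; the values below are illustrative only.
+
+```python
+from sparknlp.annotator import BertEmbeddings
+
+embeddings = BertEmbeddings.pretrained("small_bert_L2_768_opt", "en") \
+    .setInputCols("sentence", "token") \
+    .setOutputCol("embeddings") \
+    .setBatchSize(16) \
+    .setMaxSentenceLength(128)  # truncate long sentences to bound latency
+```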
+
+## Results
+
+```bash
+ token   en_embed_bert_small_L2_768_embeddings
+
+ I       [-0.2451338768005371, 0.40763044357299805, -0....
+ love    [-0.23793038725852966, -0.07403656840324402, -...
+ NLP     [-0.864113450050354, -0.2902209758758545, 0.54...
+```
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|small_bert_L2_768_opt|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|141.9 MB|
+|Case sensitive:|false|
+
+## References
+
+The model is imported from [https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1](https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1)
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_quantized_en.md
new file mode 100644
index 00000000000000..88d5059afbcd2d
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-28-small_bert_L2_768_quantized_en.md
@@ -0,0 +1,136 @@
+---
+layout: model
+title: Smaller BERT Embeddings (L-2_H-768_A-12)
+author: John Snow Labs
+name: small_bert_L2_768_quantized
+date: 2023-06-28
+tags: [open_source, embeddings, en, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This is one of the smaller BERT models referenced in [Well-Read Students Learn Better: On the Importance of Pre-training Compact Models](https://arxiv.org/abs/1908.08962). The smaller BERT models are intended for environments with restricted computational resources. They can be fine-tuned in the same manner as the original BERT models. However, they are most effective in the context of knowledge distillation, where the fine-tuning labels are produced by a larger and more accurate teacher.
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_quantized_en_5.0.0_3.0_1687953706128.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/small_bert_L2_768_quantized_en_5.0.0_3.0_1687953706128.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+...
+embeddings = BertEmbeddings.pretrained("small_bert_L2_768_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("small_bert_L2_768_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.small_L2_768').predict(text, output_level='token')
+embeddings_df
+```
+
+
+## Results
+
+```bash
+ token   en_embed_bert_small_L2_768_embeddings
+
+ I       [-0.2451338768005371, 0.40763044357299805, -0....
+ love    [-0.23793038725852966, -0.07403656840324402, -...
+ NLP     [-0.864113450050354, -0.2902209758758545, 0.54...
+```
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|small_bert_L2_768_quantized|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|97.4 MB|
+|Case sensitive:|false|
+
+## References
+
+The model is imported from [https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1](https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1)
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_en.md b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_en.md
new file mode 100644
index 00000000000000..d83d5b747e5e33
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_en.md
@@ -0,0 +1,137 @@
+---
+layout: model
+title: BERT Embeddings (Base Cased)
+author: John Snow Labs
+name: bert_base_cased
+date: 2023-06-29
+tags: [open_source, embeddings, en, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This model contains a deep bidirectional transformer trained on Wikipedia and the BookCorpus. The details are described in the paper "[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)".
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_cased_en_5.0.0_3.0_1688044252396.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_cased_en_5.0.0_3.0_1688044252396.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+ +{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +... +embeddings = BertEmbeddings.pretrained("bert_base_cased", "en") \ +.setInputCols("sentence", "token") \ +.setOutputCol("embeddings") +nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings]) +pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text")) +result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"])) +``` + +```scala +... +val embeddings = BertEmbeddings.pretrained("bert_base_cased", "en") +.setInputCols("sentence", "token") +.setOutputCol("embeddings") +val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings)) +val data = Seq("I love NLP").toDF("text") +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu + +text = ["I love NLP"] +embeddings_df = nlu.load('en.embed.bert.base_cased').predict(text, output_level='token') +embeddings_df +``` + +
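+
+To avoid re-downloading the annotator on every run, it can be persisted with standard Spark ML persistence. A sketch; the local path is illustrative, and `embeddings` is the instance from the example above.
+
+```python
+from sparknlp.annotator import BertEmbeddings
+
+# save once, then load from disk in later sessions
+embeddings.write().overwrite().save("/tmp/bert_base_cased_spark_nlp")
+
+reloaded = BertEmbeddings.load("/tmp/bert_base_cased_spark_nlp") \
+    .setInputCols("sentence", "token") \
+    .setOutputCol("embeddings")
+```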
+
+## Results
+
+```bash
+ token   en_embed_bert_base_cased_embeddings
+
+ I       [0.43879568576812744, -0.40160006284713745, 0....
+ love    [0.21737590432167053, -0.3865768313407898, -0....
+ NLP     [-0.16226479411125183, -0.053727392107248306, ...
+```
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_base_cased|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|403.6 MB|
+|Case sensitive:|true|
+
+## References
+
+The model is imported from [https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1](https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1)
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_opt_en.md b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_opt_en.md
new file mode 100644
index 00000000000000..3a9083db719381
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_opt_en.md
@@ -0,0 +1,137 @@
+---
+layout: model
+title: BERT Embeddings (Base Cased)
+author: John Snow Labs
+name: bert_base_cased_opt
+date: 2023-06-29
+tags: [open_source, embeddings, en, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This model contains a deep bidirectional transformer trained on Wikipedia and the BookCorpus. The details are described in the paper "[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)".
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_cased_opt_en_5.0.0_3.0_1688044364323.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_cased_opt_en_5.0.0_3.0_1688044364323.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+...
+embeddings = BertEmbeddings.pretrained("bert_base_cased_opt", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("bert_base_cased_opt", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.base_cased').predict(text, output_level='token')
+embeddings_df
+```
+
+
+## Results
+
+```bash
+ token   en_embed_bert_base_cased_embeddings
+
+ I       [0.43879568576812744, -0.40160006284713745, 0....
+ love    [0.21737590432167053, -0.3865768313407898, -0....
+ NLP     [-0.16226479411125183, -0.053727392107248306, ...
+```
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_base_cased_opt|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|403.7 MB|
+|Case sensitive:|true|
+
+## References
+
+The model is imported from [https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1](https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1)
\ No newline at end of file
diff --git a/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_quantized_en.md b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_quantized_en.md
new file mode 100644
index 00000000000000..f477c9e22a3310
--- /dev/null
+++ b/docs/_posts/ahmedlone127/2023-06-29-bert_base_cased_quantized_en.md
@@ -0,0 +1,137 @@
+---
+layout: model
+title: BERT Embeddings (Base Cased)
+author: John Snow Labs
+name: bert_base_cased_quantized
+date: 2023-06-29
+tags: [open_source, embeddings, en, onnx]
+task: Embeddings
+language: en
+edition: Spark NLP 5.0.0
+spark_version: 3.0
+supported: true
+engine: onnx
+annotator: BertEmbeddings
+article_header:
+  type: cover
+use_language_switcher: "Python-Scala-Java"
+---
+
+## Description
+
+This model contains a deep bidirectional transformer trained on Wikipedia and the BookCorpus. The details are described in the paper "[BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)".
+
+## Predicted Entities
+
+
+
+{:.btn-box}
+
+
+[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bert_base_cased_quantized_en_5.0.0_3.0_1688044431004.zip){:.button.button-orange.button-orange-trans.arr.button-icon}
+[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bert_base_cased_quantized_en_5.0.0_3.0_1688044431004.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3}
+
+## How to use
+
+{% include programmingLanguageSelectScalaPythonNLU.html %}
+
+```python
+...
+embeddings = BertEmbeddings.pretrained("bert_base_cased_quantized", "en") \
+.setInputCols("sentence", "token") \
+.setOutputCol("embeddings")
+nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, embeddings])
+pipeline_model = nlp_pipeline.fit(spark.createDataFrame([[""]]).toDF("text"))
+result = pipeline_model.transform(spark.createDataFrame([['I love NLP']], ["text"]))
+```
+
+```scala
+...
+val embeddings = BertEmbeddings.pretrained("bert_base_cased_quantized", "en")
+.setInputCols("sentence", "token")
+.setOutputCol("embeddings")
+val pipeline = new Pipeline().setStages(Array(document_assembler, sentence_detector, tokenizer, embeddings))
+val data = Seq("I love NLP").toDF("text")
+val result = pipeline.fit(data).transform(data)
+```
+
+{:.nlu-block}
+```python
+import nlu
+
+text = ["I love NLP"]
+embeddings_df = nlu.load('en.embed.bert.base_cased').predict(text, output_level='token')
+embeddings_df
+```
+
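+
+The base, `_opt` and `_quantized` exports are drop-in replacements for one another, so switching variants is just a matter of the name passed to `pretrained`. A minimal sketch:
+
+```python
+from sparknlp.annotator import BertEmbeddings
+
+# trade accuracy for size/speed by swapping the variant name
+variant = "bert_base_cased_quantized"  # or "bert_base_cased", "bert_base_cased_opt"
+embeddings = BertEmbeddings.pretrained(variant, "en") \
+    .setInputCols("sentence", "token") \
+    .setOutputCol("embeddings")
+```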
+
+## Results
+
+```bash
+ token   en_embed_bert_base_cased_embeddings
+
+ I       [0.43879568576812744, -0.40160006284713745, 0....
+ love    [0.21737590432167053, -0.3865768313407898, -0....
+ NLP     [-0.16226479411125183, -0.053727392107248306, ...
+```
+
+{:.model-param}
+## Model Information
+
+{:.table-model}
+|---|---|
+|Model Name:|bert_base_cased_quantized|
+|Compatibility:|Spark NLP 5.0.0+|
+|License:|Open Source|
+|Edition:|Official|
+|Input Labels:|[sentence, token]|
+|Output Labels:|[bert]|
+|Language:|en|
+|Size:|139.5 MB|
+|Case sensitive:|true|
+
+## References
+
+The model is imported from [https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1](https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1)
\ No newline at end of file