From d5abbc0e8bc3883c1ac08e15dcb8504a4ba679c7 Mon Sep 17 00:00:00 2001 From: Maziyar Panahi Date: Fri, 26 May 2023 13:31:55 +0200 Subject: [PATCH] Models hub (#13832) * Update 2023-05-25-distilcamembert_french_legal_fr.md --------- Co-authored-by: Mary-Sci Co-authored-by: Merve Ertas Uslu <67653613+Mary-Sci@users.noreply.github.com> * Update title for 2023-05-25-distilcamembert_french_legal_fr.md (#13831) --------- Co-authored-by: Merve Ertas Uslu <67653613+Mary-Sci@users.noreply.github.com> --- .../2023-05-25-camembert_french_legal_fr.md | 96 +++++++++++++++++++ ...3-05-25-distilcamembert_french_legal_fr.md | 96 +++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 docs/_posts/Mary-Sci/2023-05-25-camembert_french_legal_fr.md create mode 100644 docs/_posts/Mary-Sci/2023-05-25-distilcamembert_french_legal_fr.md diff --git a/docs/_posts/Mary-Sci/2023-05-25-camembert_french_legal_fr.md b/docs/_posts/Mary-Sci/2023-05-25-camembert_french_legal_fr.md new file mode 100644 index 00000000000000..6fa3398e68ced2 --- /dev/null +++ b/docs/_posts/Mary-Sci/2023-05-25-camembert_french_legal_fr.md @@ -0,0 +1,96 @@ +--- +layout: model +title: French Legal CamemBert Embeddings Model +author: John Snow Labs +name: camembert_french_legal +date: 2023-05-25 +tags: [open_source, camembert_embeddings, camembertformaskedlm, fr, tensorflow] +task: Embeddings +language: fr +edition: Spark NLP 4.4.2 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: CamemBertEmbeddings +article_header: +type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained CamemBertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legal-camembert` is a French model originally trained by `maastrichtlawtech`. 
+ +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/camembert_french_legal_fr_4.4.2_3.0_1685035847575.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/camembert_french_legal_fr_4.4.2_3.0_1685035847575.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = CamemBertEmbeddings.pretrained("camembert_french_legal","fr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["J'adore Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = CamemBertEmbeddings.pretrained("camembert_french_legal","fr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("J'adore Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|camembert_french_legal| +|Compatibility:|Spark NLP 4.4.2+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|fr| +|Size:|415.8 MB| +|Case sensitive:|true| + +## References + +https://huggingface.co/maastrichtlawtech/legal-camembert diff --git a/docs/_posts/Mary-Sci/2023-05-25-distilcamembert_french_legal_fr.md b/docs/_posts/Mary-Sci/2023-05-25-distilcamembert_french_legal_fr.md new file mode 100644 index 00000000000000..02fd758632ac5f --- /dev/null +++ b/docs/_posts/Mary-Sci/2023-05-25-distilcamembert_french_legal_fr.md @@ -0,0 +1,96 @@ +--- +layout: model +title: French Legal DistilCamemBert Embeddings Model +author: John Snow Labs +name: distilcamembert_french_legal +date: 2023-05-25 +tags: [open_source, camembert_embeddings, camembertformaskedlm, fr, tensorflow] +task: Embeddings +language: fr +edition: Spark NLP 4.4.2 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: CamemBertEmbeddings +article_header: +type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained CamemBertEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. `legal-distilcamembert` is a French model originally trained by `maastrichtlawtech`. + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/distilcamembert_french_legal_fr_4.4.2_3.0_1685031800112.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/distilcamembert_french_legal_fr_4.4.2_3.0_1685031800112.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} + +```python +documentAssembler = DocumentAssembler() \ + .setInputCol("text") \ + .setOutputCol("document") + +tokenizer = Tokenizer() \ + .setInputCols("document") \ + .setOutputCol("token") + +embeddings = CamemBertEmbeddings.pretrained("distilcamembert_french_legal","fr") \ + .setInputCols(["document", "token"]) \ + .setOutputCol("embeddings") \ + .setCaseSensitive(True) + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["J'adore Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val tokenizer = new Tokenizer() + .setInputCols("document") + .setOutputCol("token") + +val embeddings = CamemBertEmbeddings.pretrained("distilcamembert_french_legal","fr") + .setInputCols(Array("document", "token")) + .setOutputCol("embeddings") + .setCaseSensitive(true) + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("J'adore Spark NLP").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|distilcamembert_french_legal| +|Compatibility:|Spark NLP 4.4.2+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[sentence, token]| +|Output Labels:|[embeddings]| +|Language:|fr| +|Size:|256.1 MB| +|Case sensitive:|true| + +## References + +https://huggingface.co/maastrichtlawtech/legal-distilcamembert