From 30b478fd21dea71908fc674e52d0c710bac36000 Mon Sep 17 00:00:00 2001 From: jsl-models <74001263+jsl-models@users.noreply.github.com> Date: Thu, 14 Nov 2024 01:46:11 +0700 Subject: [PATCH] Add model 2024-11-13-roberta_embeddings_legal_roberta_base_en (#14456) Co-authored-by: gadde5300 --- ...oberta_embeddings_legal_roberta_base_en.md | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 docs/_posts/gadde5300/2024-11-13-roberta_embeddings_legal_roberta_base_en.md diff --git a/docs/_posts/gadde5300/2024-11-13-roberta_embeddings_legal_roberta_base_en.md b/docs/_posts/gadde5300/2024-11-13-roberta_embeddings_legal_roberta_base_en.md new file mode 100644 index 00000000000000..77ef36bb4ee436 --- /dev/null +++ b/docs/_posts/gadde5300/2024-11-13-roberta_embeddings_legal_roberta_base_en.md @@ -0,0 +1,109 @@ +--- +layout: model +title: English Legal RoBERTa Embeddings (CaseLaw, Base, Cased) +author: John Snow Labs +name: roberta_embeddings_legal_roberta_base +date: 2024-11-13 +tags: [roberta, embeddings, en, open_source, tensorflow] +task: Embeddings +language: en +edition: Spark NLP 5.5.0 +spark_version: 3.0 +supported: true +engine: tensorflow +annotator: RoBertaEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained Legal RoBERTa Embeddings model, uploaded to Hugging Face, adapted and imported into Spark NLP. `legal-roberta-base` is an English model originally trained by `saibo`. + +## Predicted Entities + + + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/roberta_embeddings_legal_roberta_base_en_5.5.0_3.0_1731462634993.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/roberta_embeddings_legal_roberta_base_en_5.5.0_3.0_1731462634993.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python +documentAssembler = DocumentAssembler() \ +.setInputCol("text") \ +.setOutputCol("document") + +tokenizer = Tokenizer() \ +.setInputCols("document") \ +.setOutputCol("token") + +embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \ +.setInputCols(["document", "token"]) \ +.setOutputCol("embeddings") + +pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings]) + +data = spark.createDataFrame([["I love Spark NLP"]]).toDF("text") + +result = pipeline.fit(data).transform(data) +``` +```scala +val documentAssembler = new DocumentAssembler() +.setInputCol("text") +.setOutputCol("document") + +val tokenizer = new Tokenizer() +.setInputCols(Array("document")) +.setOutputCol("token") + +val embeddings = RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") +.setInputCols(Array("document", "token")) +.setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(documentAssembler, tokenizer, embeddings)) + +val data = Seq("I love Spark NLP").toDF("text") + +val result = pipeline.fit(data).transform(data) +``` + +{:.nlu-block} +```python +import nlu +nlu.load("en.embed.legal_roberta_base").predict("""I love Spark NLP""") +``` +
+ +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|roberta_embeddings_legal_roberta_base| +|Compatibility:|Spark NLP 5.5.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[document, token]| +|Output Labels:|[embeddings]| +|Language:|en| +|Size:|468.9 MB| +|Case sensitive:|true| + +## Benchmarking + +```bash +- https://huggingface.co/saibo/legal-roberta-base +- https://www.kaggle.com/uspto/patent-litigations +- https://case.law/ +- https://www.kaggle.com/bigquery/patents +- https://www.kaggle.com/sohier/beyond-queries-exploring-the-bigquery-api +``` \ No newline at end of file