From 9ba20c92da162f1c0b8317d426acf0bbeb4be169 Mon Sep 17 00:00:00 2001 From: Cabir40 Date: Mon, 21 Oct 2024 19:32:09 +0700 Subject: [PATCH 1/3] Add model 2024-10-21-bge_medembed_small_v0_1_en --- .../2024-10-21-bge_medembed_small_v0_1_en.md | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 docs/_posts/Cabir40/2024-10-21-bge_medembed_small_v0_1_en.md diff --git a/docs/_posts/Cabir40/2024-10-21-bge_medembed_small_v0_1_en.md b/docs/_posts/Cabir40/2024-10-21-bge_medembed_small_v0_1_en.md new file mode 100644 index 00000000000000..b46280a80ba64c --- /dev/null +++ b/docs/_posts/Cabir40/2024-10-21-bge_medembed_small_v0_1_en.md @@ -0,0 +1,101 @@ +--- +layout: model +title: English bge_medembed_small_v0_1 BGEEmbeddings from abhinand +author: John Snow Labs +name: bge_medembed_small_v0_1 +date: 2024-10-21 +tags: [embedding, en, open_source, bge, medical, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.5.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BGEEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BGEEmbeddings model, adapted from Hugging Face and curated to provide scalability and production-readiness using Spark NLP. +`bge_medembed_small_v0_1` is an English model originally trained by abhinand. + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bge_medembed_small_v0_1_en_5.5.0_3.0_1729513920928.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bge_medembed_small_v0_1_en_5.5.0_3.0_1729513920928.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +embeddings = BGEEmbeddings.pretrained("bge_medembed_small_v0_1","en")\ + .setInputCols(["document"])\ + .setOutputCol("embeddings") + +pipeline = Pipeline( + stages = [ + document_assembler, + embeddings +]) + +data = spark.createDataFrame([["I love spark-nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +``` +```scala + +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val embeddings = BGEEmbeddings.pretrained("bge_medembed_small_v0_1","en") + .setInputCols(Array("document")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(document_assembler, embeddings)) + +val data = Seq("I love spark-nlp").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) + +``` +
+ +## Results + +```bash + ++----------------------------------------------------------------------------------------------------+ +| bge_embedding| ++----------------------------------------------------------------------------------------------------+ +|[{sentence_embeddings, 0, 15, I love spark-nlp, {sentence -> 0}, [-0.07673764, -0.04207312, 0.026...| ++----------------------------------------------------------------------------------------------------+ + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bge_medembed_small_v0_1| +|Compatibility:|Spark NLP 5.5.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[document]| +|Output Labels:|[bge]| +|Language:|en| +|Size:|116.4 MB| \ No newline at end of file From 6c6fef205896bc7142d0516b7111de568304e0da Mon Sep 17 00:00:00 2001 From: Cabir40 Date: Mon, 21 Oct 2024 19:55:03 +0700 Subject: [PATCH 2/3] Add model 2024-10-21-bge_medembed_large_v0_1_en --- .../2024-10-21-bge_medembed_large_v0_1_en.md | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 docs/_posts/Cabir40/2024-10-21-bge_medembed_large_v0_1_en.md diff --git a/docs/_posts/Cabir40/2024-10-21-bge_medembed_large_v0_1_en.md b/docs/_posts/Cabir40/2024-10-21-bge_medembed_large_v0_1_en.md new file mode 100644 index 00000000000000..ef752830de8a8f --- /dev/null +++ b/docs/_posts/Cabir40/2024-10-21-bge_medembed_large_v0_1_en.md @@ -0,0 +1,101 @@ +--- +layout: model +title: English bge_medembed_large_v0_1 BGEEmbeddings from abhinand +author: John Snow Labs +name: bge_medembed_large_v0_1 +date: 2024-10-21 +tags: [embedding, en, open_source, bge, medical, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.5.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BGEEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BGEEmbeddings model, adapted from Hugging Face and curated to provide scalability and 
production-readiness using Spark NLP. +`bge_medembed_large_v0_1` is an English model originally trained by abhinand. + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bge_medembed_large_v0_1_en_5.5.0_3.0_1729515260623.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bge_medembed_large_v0_1_en_5.5.0_3.0_1729515260623.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +embeddings = BGEEmbeddings.pretrained("bge_medembed_large_v0_1","en")\ + .setInputCols(["document"])\ + .setOutputCol("embeddings") + +pipeline = Pipeline( + stages = [ + document_assembler, + embeddings +]) + +data = spark.createDataFrame([["I love spark-nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +``` +```scala + +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val embeddings = BGEEmbeddings.pretrained("bge_medembed_large_v0_1","en") + .setInputCols(Array("document")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(document_assembler, embeddings)) + +val data = Seq("I love spark-nlp").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) + +``` +
+ +## Results + +```bash + ++----------------------------------------------------------------------------------------------------+ +| bge_embedding| ++----------------------------------------------------------------------------------------------------+ +|[{sentence_embeddings, 0, 15, I love spark-nlp, {sentence -> 0}, [-0.018065551, -0.032784615, 0.0...| ++----------------------------------------------------------------------------------------------------+ + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bge_medembed_large_v0_1| +|Compatibility:|Spark NLP 5.5.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[document]| +|Output Labels:|[bge]| +|Language:|en| +|Size:|1.2 GB| \ No newline at end of file From 576bc8ae4d251f2f531336e8e8ed68a079433141 Mon Sep 17 00:00:00 2001 From: Cabir40 Date: Mon, 21 Oct 2024 19:57:27 +0700 Subject: [PATCH 3/3] Add model 2024-10-21-bge_medembed_base_v0_1_en --- .../2024-10-21-bge_medembed_base_v0_1_en.md | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 docs/_posts/Cabir40/2024-10-21-bge_medembed_base_v0_1_en.md diff --git a/docs/_posts/Cabir40/2024-10-21-bge_medembed_base_v0_1_en.md b/docs/_posts/Cabir40/2024-10-21-bge_medembed_base_v0_1_en.md new file mode 100644 index 00000000000000..f21ba93b82dd9c --- /dev/null +++ b/docs/_posts/Cabir40/2024-10-21-bge_medembed_base_v0_1_en.md @@ -0,0 +1,101 @@ +--- +layout: model +title: English bge_medembed_base_v0_1 BGEEmbeddings from abhinand +author: John Snow Labs +name: bge_medembed_base_v0_1 +date: 2024-10-21 +tags: [embedding, en, open_source, bge, medical, onnx] +task: Embeddings +language: en +edition: Spark NLP 5.5.0 +spark_version: 3.0 +supported: true +engine: onnx +annotator: BGEEmbeddings +article_header: + type: cover +use_language_switcher: "Python-Scala-Java" +--- + +## Description + +Pretrained BGEEmbeddings model, adapted from Hugging Face and curated to provide scalability and 
production-readiness using Spark NLP. +`bge_medembed_base_v0_1` is an English model originally trained by abhinand. + +{:.btn-box} + + +[Download](https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/models/bge_medembed_base_v0_1_en_5.5.0_3.0_1729515433167.zip){:.button.button-orange.button-orange-trans.arr.button-icon} +[Copy S3 URI](s3://auxdata.johnsnowlabs.com/public/models/bge_medembed_base_v0_1_en_5.5.0_3.0_1729515433167.zip){:.button.button-orange.button-orange-trans.button-icon.button-copy-s3} + +## How to use + + + +
+{% include programmingLanguageSelectScalaPythonNLU.html %} +```python + +document_assembler = DocumentAssembler()\ + .setInputCol("text")\ + .setOutputCol("document") + +embeddings = BGEEmbeddings.pretrained("bge_medembed_base_v0_1","en")\ + .setInputCols(["document"])\ + .setOutputCol("embeddings") + +pipeline = Pipeline( + stages = [ + document_assembler, + embeddings +]) + +data = spark.createDataFrame([["I love spark-nlp"]]).toDF("text") + +result = pipeline.fit(data).transform(data) + +``` +```scala + +val document_assembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val embeddings = BGEEmbeddings.pretrained("bge_medembed_base_v0_1","en") + .setInputCols(Array("document")) + .setOutputCol("embeddings") + +val pipeline = new Pipeline().setStages(Array(document_assembler, embeddings)) + +val data = Seq("I love spark-nlp").toDS.toDF("text") + +val result = pipeline.fit(data).transform(data) + +``` +
+ +## Results + +```bash + ++----------------------------------------------------------------------------------------------------+ +| bge_embedding| ++----------------------------------------------------------------------------------------------------+ +|[{sentence_embeddings, 0, 15, I love spark-nlp, {sentence -> 0}, [-0.018065551, -0.032784615, 0.0...| ++----------------------------------------------------------------------------------------------------+ + +``` + +{:.model-param} +## Model Information + +{:.table-model} +|---|---| +|Model Name:|bge_medembed_base_v0_1| +|Compatibility:|Spark NLP 5.5.0+| +|License:|Open Source| +|Edition:|Official| +|Input Labels:|[document]| +|Output Labels:|[bge]| +|Language:|en| +|Size:|389.7 MB| \ No newline at end of file