added Nomic python api and tests

JohnSnowLabs · Mar 27, 2024 · fe5537a · fe5537a
1 parent 3d52a7c
commit fe5537a
Show file tree

Hide file tree

Showing 6 changed files with 255 additions and 19 deletions.
diff --git a/python/sparknlp/annotator/embeddings/__init__.py b/python/sparknlp/annotator/embeddings/__init__.py
@@ -36,3 +36,4 @@
 from sparknlp.annotator.embeddings.xlm_roberta_sentence_embeddings import *
 from sparknlp.annotator.embeddings.xlnet_embeddings import *
 from sparknlp.annotator.embeddings.bge_embeddings import *
+from sparknlp.annotator.embeddings.nomic_embeddings import *
diff --git a/python/sparknlp/annotator/embeddings/nomic_embeddings.py b/python/sparknlp/annotator/embeddings/nomic_embeddings.py
@@ -0,0 +1,181 @@
+#  Copyright 2017-2022 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""Contains classes for NomicEmbeddings."""
+
+from sparknlp.common import *
+
+
+class NomicEmbeddings(AnnotatorModel, HasEmbeddingsProperties, HasCaseSensitiveProperties, HasStorageRef,
+                      HasBatchedAnnotate, HasMaxSentenceLengthLimit):
+    """Sentence embeddings using NomicEmbeddings.
+
+    nomic-embed-text-v1 is an 8192 context length text encoder that surpasses OpenAI
+    text-embedding-ada-002 and text-embedding-3-small performance on short and long context tasks.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> embeddings = NomicEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("nomic_embeddings")
+
+
+    The default model is ``"nomic_small"``, if no name is provided.
+
+    For available pretrained models please see the
+    `Models Hub <https://sparknlp.org/models?q=Nomic>`__.
+
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``            ``SENTENCE_EMBEDDINGS``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Size of every batch, by default 8
+    dimension
+        Number of embedding dimensions, by default 768
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default False
+    maxSentenceLength
+        Max sentence length to process, by default 512
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+
+    References
+    ----------
+    `Nomic Embed: Training a Reproducible Long Context Text Embedder <https://static.nomic.ai/reports/2024_Nomic_Embed_Text_Technical_Report.pdf>`__
+
+    https://github.com/nomicai/contrastors
+
+    **Paper abstract**
+
+    *This technical report describes the training
+    of nomic-embed-text-v1, the first fully reproducible,
+    open-source, open-weights, open-data, 8192 context length
+    English text embedding model that outperforms both OpenAI
+    Ada-002 and OpenAI text-embedding-3-small
+    on short and long-context tasks. We release
+    the training code and model weights under
+    an Apache 2 license. In contrast with other
+    open-source models, we release a training data
+    loader with 235 million curated text pairs that
+    allows for the full replication of nomic-embed-text-v1.
+    You can find code and data to replicate the
+    model at https://github.com/nomicai/contrastors.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> embeddings = NomicEmbeddings.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("nomic_embeddings")
+    >>> embeddingsFinisher = EmbeddingsFinisher() \\
+    ...     .setInputCols(["nomic_embeddings"]) \\
+    ...     .setOutputCols("finished_embeddings") \\
+    ...     .setOutputAsVector(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     embeddings,
+    ...     embeddingsFinisher
+    ... ])
+    >>> data = spark.createDataFrame([
+    ...     ["query: how much protein should a female eat"],
+    ...     ["passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. "
+    ...      "But, as you can see from this chart, you'll need to increase that if you're expecting or training for a "
+    ...      "marathon. Check out the chart below to see how much protein you should be eating each day."],
+    ... ]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.selectExpr("explode(finished_embeddings) as result").show(5, 80)
+    +--------------------------------------------------------------------------------+
+    |                                                                          result|
+    +--------------------------------------------------------------------------------+
+    |[[8.0190285E-4, -0.005974853, -0.072875895, 0.007944068, 0.026059335, -0.0080...|
+    |[[0.050514214, 0.010061974, -0.04340176, -0.020937217, 0.05170225, 0.01157857...|
+    +--------------------------------------------------------------------------------+
+    """
+
+    name = "NomicEmbeddings"
+
+    # This annotator consumes DOCUMENT annotations and emits one
+    # SENTENCE_EMBEDDINGS annotation type.
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+    # Serialized TensorFlow ConfigProto, stored as a list of ints.
+    configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+
+        Returns
+        -------
+        The value returned by ``_set`` (presumably this instance, so that
+        setter calls can be chained).
+        """
+        return self._set(configProtoBytes=b)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.embeddings.NomicEmbeddings", java_model=None):
+        """Creates the Python wrapper and registers the default parameter values.
+
+        Parameters
+        ----------
+        classname : str, optional
+            Fully qualified name of the backing JVM class
+        java_model : optional
+            Already constructed JVM model to wrap, by default None
+        """
+        super(NomicEmbeddings, self).__init__(classname=classname, java_model=java_model)
+        self._setDefault(dimension=768, batchSize=8, maxSentenceLength=512, caseSensitive=False)
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        NomicEmbeddings
+            The restored model
+        """
+        # Imported lazily, inside the method, as in the original code.
+        from sparknlp.internal import _NomicLoader
+        jModel = _NomicLoader(folder, spark_session._jsparkSession)._java_obj
+        return NomicEmbeddings(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="nomic_small", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "nomic_small"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        NomicEmbeddings
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(NomicEmbeddings, name, lang, remote_loc)
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
@@ -251,6 +251,9 @@ def __init__(self, path, jspark, useCache):
         super(_BartLoader, self).__init__(
             "com.johnsnowlabs.nlp.annotators.seq2seq.BartTransformer.loadSavedModel", path, jspark, useCache)
 
+class _NomicLoader(ExtendedJavaWrapper):
+    """Wrapper around the JVM ``NomicEmbeddings.loadSavedModel`` entry point.
+
+    Parameters
+    ----------
+    path : str
+        Location of the saved model, forwarded unchanged to the JVM method
+    jspark
+        Java SparkSession handle, forwarded unchanged to the JVM method
+    """
+
+    def __init__(self, path, jspark):
+        # Fully qualified name of the JVM method handed to ExtendedJavaWrapper.
+        loader_method = "com.johnsnowlabs.nlp.embeddings.NomicEmbeddings.loadSavedModel"
+        super(_NomicLoader, self).__init__(loader_method, path, jspark)
 
 class _USELoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark, loadsp):

diff --git a/python/test/annotator/embeddings/nomic_embeddings_test.py b/python/test/annotator/embeddings/nomic_embeddings_test.py
@@ -0,0 +1,56 @@
+#  Copyright 2017-2022 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import os
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests
+from test.util import SparkContextForTest
+
+
+@pytest.mark.slow
+class NomicEmbeddingsTestSpec(unittest.TestCase):
+    """Slow smoke test that runs the pretrained NomicEmbeddings on sample texts.
+
+    The test only prints the resulting embeddings; it makes no assertions.
+    """
+
+    def setUp(self):
+        """Fetches the shared Spark session and configures the annotator."""
+        self.spark = SparkContextForTest.spark
+        self.tested_annotator = (
+            NomicEmbeddings.pretrained()
+            .setInputCols(["documents"])
+            .setOutputCol("nomic")
+        )
+
+    def runTest(self):
+        """Embeds two queries and two passages and shows the embeddings column."""
+        rows = [
+            [1, "query: how much protein should a female eat"],
+            [2, "query: summit define"],
+            [3, "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 "
+                "is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're "
+                "expecting or training for a marathon. Check out the chart below to see how much protein you should "
+                "be eating each day."],
+            [4, "passage: Definition of summit for English Language Learners. : 1  the highest point of a mountain :"
+                " the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the "
+                "leaders of two or more governments."],
+        ]
+        data = self.spark.createDataFrame(rows).toDF("id", "text")
+
+        # The assembler's output column must match the annotator's input column.
+        assembler = DocumentAssembler().setInputCol("text").setOutputCol("documents")
+
+        stages = [assembler, self.tested_annotator]
+        fitted = Pipeline().setStages(stages).fit(data)
+        embedded = fitted.transform(data)
+
+        embedded.select("nomic.embeddings").show(truncate=False)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/NomicEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/NomicEmbeddings.scala
@@ -38,9 +38,8 @@ import org.slf4j.{Logger, LoggerFactory}
 
 /** Sentence embeddings using NomicEmbeddings.
   *
-  * NomicEmbeddings, an instruction-finetuned text embedding model that can generate text
-  * embeddings tailored to any task (e.g., classification, retrieval, clustering, text evaluation,
-  * etc.)
+  * nomic-embed-text-v1 is an 8192 context length text encoder that surpasses OpenAI
+  * text-embedding-ada-002 and text-embedding-3-small performance on short and long context tasks.
   *
   * Pretrained models can be loaded with `pretrained` of the companion object:
   * {{{
@@ -58,23 +57,19 @@ import org.slf4j.{Logger, LoggerFactory}
   *
   * '''Sources''' :
   *
-  * [[https://arxiv.org/pdf/2212.03533 Text Embeddings by Weakly-Supervised Contrastive Pre-training]]
+  * [[https://static.nomic.ai/reports/2024_Nomic_Embed_Text_Technical_Report.pdf Nomic Embed: Training a Reproducible Long Context Text Embedder]]
   *
-  * [[https://github.com/microsoft/unilm/tree/master/nomic NomicEmbeddings Github Repository]]
+  * [[https://github.com/nomicai/contrastors NomicEmbeddings Github Repository]]
   *
   * ''' Paper abstract '''
   *
-  * ''This paper presents NomicEmbeddings, a family of state-of-the-art text embeddings that
-  * transfer well to a wide range of tasks. The model is trained in a contrastive manner with weak
-  * supervision signals from our curated large-scale text pair dataset (called CCPairs).
-  * NomicEmbeddings can be readily used as a general-purpose embedding model for any tasks
-  * requiring a single-vector representation of texts such as retrieval, clustering, and
-  * classification, achieving strong performance in both zero-shot and fine-tuned settings. We
-  * conduct extensive evaluations on 56 datasets from the BEIR and MTEB benchmarks. For zero-shot
-  * settings, NomicEmbeddings is the first model that outperforms the strong BM25 baseline on the
-  * BEIR retrieval benchmark without using any labeled data. When fine-tuned, NomicEmbeddings
-  * obtains the best results on the MTEB benchmark, beating existing embedding models with 40×
-  * more parameters.''
+  * ''This technical report describes the training of nomic-embed-text-v1, the first fully
+  * reproducible, open-source, open-weights, open-data, 8192 context length English text embedding
+  * model that outperforms both OpenAI Ada-002 and OpenAI text-embedding-3-small on short and
+  * long-context tasks. We release the training code and model weights under an Apache 2 license.
+  * In contrast with other open-source models, we release a training data loader with 235 million
+  * curated text pairs that allows for the full replication of nomic-embed-text-v1. You can find
+  * code and data to replicate the model at https://github.com/nomicai/contrastors.''
   *
   * ==Example==
   * {{{
@@ -202,8 +197,8 @@ class NomicEmbeddings(override val uid: String)
   /** @group setParam */
   def setMaxSentenceLength(value: Int): this.type = {
     require(
-      value <= 512,
-      "NomicEmbeddings models do not support sequences longer than 512 because of trainable positional embeddings.")
+      value <= 8192,
+      "NomicEmbeddings models do not support sequences longer than 8192 because of trainable positional embeddings.")
     require(value >= 1, "The maxSentenceLength must be at least 1")
     set(maxSentenceLength, value)
     this

diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/NomicEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/NomicEmbeddingsTestSpec.scala
@@ -19,7 +19,7 @@ package com.johnsnowlabs.nlp.embeddings
 import com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLModel
 import com.johnsnowlabs.nlp.base.DocumentAssembler
 import com.johnsnowlabs.nlp.util.io.ResourceHelper
-import com.johnsnowlabs.tags.{SlowTest, SlowTest}
+import com.johnsnowlabs.tags.SlowTest
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.sql.functions.{col, size}
 import org.scalatest.flatspec.AnyFlatSpec