From 1ccf94399f2dbeb754fcac17a779ba47583fe64f Mon Sep 17 00:00:00 2001
From: Prabod Rathnayaka
Date: Mon, 11 Mar 2024 04:41:13 +0000
Subject: [PATCH] added cpm python api and tests

---
 python/sparknlp/annotator/seq2seq/__init__.py |   1 +
 .../annotator/seq2seq/cpm_transformer.py      | 321 ++++++++++++++++++
 python/sparknlp/internal/__init__.py          |   5 +
 .../annotator/seq2seq/cpm_transformer_test.py |  46 +++
 .../annotators/seq2seq/CPMTransformer.scala   |  11 -
 5 files changed, 373 insertions(+), 11 deletions(-)
 create mode 100644 python/sparknlp/annotator/seq2seq/cpm_transformer.py
 create mode 100644 python/test/annotator/seq2seq/cpm_transformer_test.py

diff --git a/python/sparknlp/annotator/seq2seq/__init__.py b/python/sparknlp/annotator/seq2seq/__init__.py
index 5abf7be0d12dfb..c2e60667349916 100644
--- a/python/sparknlp/annotator/seq2seq/__init__.py
+++ b/python/sparknlp/annotator/seq2seq/__init__.py
@@ -19,3 +19,4 @@
 from sparknlp.annotator.seq2seq.bart_transformer import *
 from sparknlp.annotator.seq2seq.llama2_transformer import *
 from sparknlp.annotator.seq2seq.m2m100_transformer import *
+from sparknlp.annotator.seq2seq.cpm_transformer import *
diff --git a/python/sparknlp/annotator/seq2seq/cpm_transformer.py b/python/sparknlp/annotator/seq2seq/cpm_transformer.py
new file mode 100644
index 00000000000000..a3e29f421aff31
--- /dev/null
+++ b/python/sparknlp/annotator/seq2seq/cpm_transformer.py
@@ -0,0 +1,321 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the CPMTransformer."""
+
+from sparknlp.common import *
+
+
+class CPMTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+    """MiniCPM: Unveiling the Potential of End-side Large Language Models
+
+    MiniCPM is a series of edge-side large language models, with the base model, MiniCPM-2B,
+    having 2.4B non-embedding parameters. It ranks closely with Mistral-7B on comprehensive
+    benchmarks (with better performance in Chinese, mathematics, and coding abilities), surpassing
+    models like Llama2-13B, MPT-30B, and Falcon-40B. On the MTBench benchmark, which is closest to
+    user experience, MiniCPM-2B also outperforms many representative open-source models such as
+    Llama2-70B-Chat, Vicuna-33B, Mistral-7B-Instruct-v0.1, and Zephyr-7B-alpha.
+
+    After DPO, MiniCPM outperforms Llama2-70B-Chat, Vicuna-33B, Mistral-7B-Instruct-v0.1,
+    Zephyr-7B-alpha, etc. on MTBench.
+
+    MiniCPM-V, based on MiniCPM-2B, achieves the best overall performance among multimodal models
+    of the same scale, surpassing existing multimodal large models built on Phi-2 and achieving
+    performance comparable to or even better than the 9.6B Qwen-VL-Chat on some tasks.
+
+    MiniCPM can be deployed for inference on smartphones, where its streaming output speed
+    exceeds human speaking speed.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> cpm = CPMTransformer.pretrained() \\
+    ...     .setInputCols(["document"]) \\
.setOutputCol("generation") + + + The default model is ``"llam2-7b"``, if no name is provided. For available + pretrained models please see the `Models Hub + `__. + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``DOCUMENT`` + ====================== ====================== + + Parameters + ---------- + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. + minOutputLength + Minimum length of the sequence to be generated, by default 0 + maxOutputLength + Maximum length of output text, by default 20 + doSample + Whether or not to use sampling; use greedy decoding otherwise, by default False + temperature + The value used to module the next token probabilities, by default 1.0 + topK + The number of highest probability vocabulary tokens to keep for + top-k-filtering, by default 50 + topP + Top cumulative probability for vocabulary tokens, by default 1.0 + + If set to float < 1, only the most probable tokens with probabilities + that add up to ``topP`` or higher are kept for generation. + repetitionPenalty + The parameter for repetition penalty, 1.0 means no penalty. , by default + 1.0 + noRepeatNgramSize + If set to int > 0, all ngrams of that size can only occur once, by + default 0 + ignoreTokenIds + A list of token ids which are ignored in the decoder's output, by + default [] + + Notes + ----- + This is a very computationally expensive module especially on larger + sequence. The use of an accelerator such as GPU is recommended. + + References + ---------- + - `MiniCPM: Unveiling the Potential of End-side Large Language Models + ` + - https://github.com/OpenBMB/MiniCPM + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> documentAssembler = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("documents") + >>> cpm = CPMTransformer.pretrained("llama_2_7b_chat_hf_int4") \\ + ... .setInputCols(["documents"]) \\ + ... .setMaxOutputLength(50) \\ + ... .setOutputCol("generation") + >>> pipeline = Pipeline().setStages([documentAssembler, cpm]) + >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.select("summaries.generation").show(truncate=False) + +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |result | + +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |[My name is Leonardo. I am a student at the University of California, Los Angeles. I have a passion for writing and learning about different cultures. I enjoy playing basketball and watching movies]| + -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + """ + + name = "CPMTransformer" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + + outputAnnotatorType = AnnotatorType.DOCUMENT + + configProtoBytes = Param(Params._dummy(), "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. 
Get with config_proto.SerializeToString()", + TypeConverters.toListInt) + + minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated", + typeConverter=TypeConverters.toInt) + + maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text", + typeConverter=TypeConverters.toInt) + + doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise", + typeConverter=TypeConverters.toBoolean) + + temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities", + typeConverter=TypeConverters.toFloat) + + topK = Param(Params._dummy(), "topK", + "The number of highest probability vocabulary tokens to keep for top-k-filtering", + typeConverter=TypeConverters.toInt) + + topP = Param(Params._dummy(), "topP", + "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation", + typeConverter=TypeConverters.toFloat) + + repetitionPenalty = Param(Params._dummy(), "repetitionPenalty", + "The parameter for repetition penalty. 1.0 means no penalty. See `this paper `__ for more details", + typeConverter=TypeConverters.toFloat) + + noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize", + "If set to int > 0, all ngrams of that size can only occur once", + typeConverter=TypeConverters.toInt) + + ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds", + "A list of token ids which are ignored in the decoder's output", + typeConverter=TypeConverters.toListInt) + + def setIgnoreTokenIds(self, value): + """A list of token ids which are ignored in the decoder's output. + + Parameters + ---------- + value : List[int] + The words to be filtered out + """ + return self._set(ignoreTokenIds=value) + + def setConfigProtoBytes(self, b): + """Sets configProto from tensorflow, serialized into byte array. + + Parameters + ---------- + b : List[int] + ConfigProto from tensorflow, serialized into byte array + """ + return self._set(configProtoBytes=b) + + def setMinOutputLength(self, value): + """Sets minimum length of the sequence to be generated. + + Parameters + ---------- + value : int + Minimum length of the sequence to be generated + """ + return self._set(minOutputLength=value) + + def setMaxOutputLength(self, value): + """Sets maximum length of output text. + + Parameters + ---------- + value : int + Maximum length of output text + """ + return self._set(maxOutputLength=value) + + def setDoSample(self, value): + """Sets whether or not to use sampling, use greedy decoding otherwise. + + Parameters + ---------- + value : bool + Whether or not to use sampling; use greedy decoding otherwise + """ + return self._set(doSample=value) + + def setTemperature(self, value): + """Sets the value used to module the next token probabilities. + + Parameters + ---------- + value : float + The value used to module the next token probabilities + """ + return self._set(temperature=value) + + def setTopK(self, value): + """Sets the number of highest probability vocabulary tokens to keep for + top-k-filtering. + + Parameters + ---------- + value : int + Number of highest probability vocabulary tokens to keep + """ + return self._set(topK=value) + + def setTopP(self, value): + """Sets the top cumulative probability for vocabulary tokens. + + If set to float < 1, only the most probable tokens with probabilities + that add up to ``topP`` or higher are kept for generation. 
+
+        Parameters
+        ----------
+        value : float
+            Cumulative probability for vocabulary tokens
+        """
+        return self._set(topP=value)
+
+    def setRepetitionPenalty(self, value):
+        """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+        Parameters
+        ----------
+        value : float
+            The repetition penalty
+
+        References
+        ----------
+        See `Ctrl: A Conditional Transformer Language Model For Controllable
+        Generation <https://arxiv.org/abs/1909.05858>`__ for more details.
+        """
+        return self._set(repetitionPenalty=value)
+
+    def setNoRepeatNgramSize(self, value):
+        """Sets size of n-grams that can only occur once.
+
+        If set to int > 0, all ngrams of that size can only occur once.
+
+        Parameters
+        ----------
+        value : int
+            N-gram size that can only occur once
+        """
+        return self._set(noRepeatNgramSize=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.CPMTransformer", java_model=None):
+        super(CPMTransformer, self).__init__(classname=classname, java_model=java_model)
+        self._setDefault(minOutputLength=0, maxOutputLength=50, doSample=False, temperature=0.8, topK=100, topP=0.8,
+                         repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        CPMTransformer
+            The restored model
+        """
+        from sparknlp.internal import _CPMLoader
+        jModel = _CPMLoader(folder, spark_session._jsparkSession)._java_obj
+        return CPMTransformer(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="llama_2_7b_chat_hf_int4", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "llama_2_7b_chat_hf_int4"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        CPMTransformer
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(CPMTransformer, name, lang, remote_loc)
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
index c1aabeeb36aec0..af29c549735a91 100644
--- a/python/sparknlp/internal/__init__.py
+++ b/python/sparknlp/internal/__init__.py
@@ -110,6 +110,11 @@ def __init__(self, path, jspark):
             path, jspark)
 
 
+class _CPMLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark):
+        super(_CPMLoader, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.seq2seq.CPMTransformer.loadSavedModel", path, jspark)
+
 
 class _DistilBertLoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark):
diff --git a/python/test/annotator/seq2seq/cpm_transformer_test.py b/python/test/annotator/seq2seq/cpm_transformer_test.py
new file mode 100644
index 00000000000000..810505cb82a124
--- /dev/null
+++ b/python/test/annotator/seq2seq/cpm_transformer_test.py
@@ -0,0 +1,46 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from test.util import SparkContextForTest
+
+
+@pytest.mark.slow
+class CPMTransformerTextGenerationTestSpec(unittest.TestCase):
+    def setUp(self):
+        self.spark = SparkContextForTest.spark
+
+    def runTest(self):
+        data = self.spark.createDataFrame([
+            [1, """Leonardo Da Vinci invented the microscope?""".strip().replace("\n", " ")]]).toDF("id", "text")
+
+        document_assembler = DocumentAssembler() \
+            .setInputCol("text") \
+            .setOutputCol("documents")
+
+        cpm = CPMTransformer \
+            .pretrained() \
+            .setMaxOutputLength(50) \
+            .setDoSample(False) \
+            .setInputCols(["documents"]) \
+            .setOutputCol("generation")
+
+        pipeline = Pipeline().setStages([document_assembler, cpm])
+        results = pipeline.fit(data).transform(data)
+
+        results.select("generation.result").show(truncate=False)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala
index ff6ef4d6d23a5e..ab039cf00f48d4 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala
@@ -77,17 +77,6 @@ import org.json4s.jackson.JsonMethods._
  * - [[https://shengdinghu.notion.site/MiniCPM-Unveiling-the-Potential-of-End-side-Large-Language-Models-d4d3a8c426424654a4e80e42a711cb20 MiniCPM: Unveiling the Potential of End-side Large Language Models]]
  * - [[https://github.com/OpenBMB/MiniCPM]]
  *
- * '''Paper Abstract:'''
- *
- * ''In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned
- * large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our
- * fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models
- * outperform open-source chat models on most benchmarks we tested, and based on our human
- * evaluations for helpfulness and safety, may be a suitable substitute for closed-source models.
- * We provide a detailed description of our approach to fine-tuning and safety improvements of
- * Llama 2-Chat in order to enable the community to build on our work and contribute to the
- * responsible development of LLMs.''
- *
  * '''Note:'''
  *
  * This is a very computationally expensive module especially on larger sequence. The use of an
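---
A minimal end-to-end sketch of how the pieces of this patch fit together, assuming the patch is applied, Spark NLP is on the classpath, and the default pretrained model "llama_2_7b_chat_hf_int4" is available from the Models Hub; the generation settings restate the annotator's defaults and the input sentence is illustrative only:

# Exercises the new CPMTransformer Python API end to end.
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import CPMTransformer
from pyspark.ml import Pipeline

# Start a SparkSession with Spark NLP loaded.
spark = sparknlp.start()

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("documents")

# pretrained() with no arguments resolves to "llama_2_7b_chat_hf_int4".
# For a locally exported model, CPMTransformer.loadSavedModel(folder, spark)
# is the alternative entry point, backed by the new _CPMLoader wrapper.
cpm = CPMTransformer.pretrained() \
    .setInputCols(["documents"]) \
    .setMaxOutputLength(50) \
    .setDoSample(False) \
    .setOutputCol("generation")

pipeline = Pipeline().setStages([document_assembler, cpm])

data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
pipeline.fit(data).transform(data) \
    .select("generation.result") \
    .show(truncate=False)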