From 1ccf94399f2dbeb754fcac17a779ba47583fe64f Mon Sep 17 00:00:00 2001
From: Prabod Rathnayaka
Date: Mon, 11 Mar 2024 04:41:13 +0000
Subject: [PATCH] added cpm python api and tests

---
 python/sparknlp/annotator/seq2seq/__init__.py |   1 +
 .../annotator/seq2seq/cpm_transformer.py      | 321 ++++++++++++++++++
 python/sparknlp/internal/__init__.py          |   5 +
 .../annotator/seq2seq/cpm_transformer_test.py |  46 +++
 .../annotators/seq2seq/CPMTransformer.scala   |  11 -
 5 files changed, 373 insertions(+), 11 deletions(-)
 create mode 100644 python/sparknlp/annotator/seq2seq/cpm_transformer.py
 create mode 100644 python/test/annotator/seq2seq/cpm_transformer_test.py

diff --git a/python/sparknlp/annotator/seq2seq/__init__.py b/python/sparknlp/annotator/seq2seq/__init__.py
index 5abf7be0d12dfb..c2e60667349916 100644
--- a/python/sparknlp/annotator/seq2seq/__init__.py
+++ b/python/sparknlp/annotator/seq2seq/__init__.py
@@ -19,3 +19,4 @@
 from sparknlp.annotator.seq2seq.bart_transformer import *
 from sparknlp.annotator.seq2seq.llama2_transformer import *
 from sparknlp.annotator.seq2seq.m2m100_transformer import *
+from sparknlp.annotator.seq2seq.cpm_transformer import *
diff --git a/python/sparknlp/annotator/seq2seq/cpm_transformer.py b/python/sparknlp/annotator/seq2seq/cpm_transformer.py
new file mode 100644
index 00000000000000..a3e29f421aff31
--- /dev/null
+++ b/python/sparknlp/annotator/seq2seq/cpm_transformer.py
@@ -0,0 +1,321 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains classes for the CPMTransformer."""
+
+from sparknlp.common import *
+
+
+class CPMTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+    """MiniCPM: Unveiling the Potential of End-side Large Language Models
+
+    MiniCPM is a series of edge-side large language models, with the base model, MiniCPM-2B,
+    having 2.4B non-embedding parameters. It ranks closely with Mistral-7B on comprehensive
+    benchmarks (with better performance in Chinese, mathematics, and coding abilities), surpassing
+    models like Llama2-13B, MPT-30B, and Falcon-40B. On the MTBench benchmark, which is closest to
+    user experience, MiniCPM-2B also outperforms many representative open-source models such as
+    Llama2-70B-Chat, Vicuna-33B, Mistral-7B-Instruct-v0.1, and Zephyr-7B-alpha.
+
+    After DPO, MiniCPM outperforms Llama2-70B-Chat, Vicuna-33B, Mistral-7B-Instruct-v0.1,
+    Zephyr-7B-alpha, etc. on MTBench.
+
+    MiniCPM-V, based on MiniCPM-2B, achieves the best overall performance among multimodal models
+    of the same scale, surpassing existing multimodal large models built on Phi-2 and achieving
+    performance comparable to or even better than the 9.6B Qwen-VL-Chat on some tasks.
+
+    MiniCPM can be deployed for inference on smartphones, where its streaming output speed
+    exceeds human speaking speed.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> cpm = CPMTransformer.pretrained() \\
+    ...     .setInputCols(["document"]) \\
.setOutputCol("generation") + + + The default model is ``"llam2-7b"``, if no name is provided. For available + pretrained models please see the `Models Hub + `__. + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``DOCUMENT`` + ====================== ====================== + + Parameters + ---------- + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. + minOutputLength + Minimum length of the sequence to be generated, by default 0 + maxOutputLength + Maximum length of output text, by default 20 + doSample + Whether or not to use sampling; use greedy decoding otherwise, by default False + temperature + The value used to module the next token probabilities, by default 1.0 + topK + The number of highest probability vocabulary tokens to keep for + top-k-filtering, by default 50 + topP + Top cumulative probability for vocabulary tokens, by default 1.0 + + If set to float < 1, only the most probable tokens with probabilities + that add up to ``topP`` or higher are kept for generation. + repetitionPenalty + The parameter for repetition penalty, 1.0 means no penalty. , by default + 1.0 + noRepeatNgramSize + If set to int > 0, all ngrams of that size can only occur once, by + default 0 + ignoreTokenIds + A list of token ids which are ignored in the decoder's output, by + default [] + + Notes + ----- + This is a very computationally expensive module especially on larger + sequence. The use of an accelerator such as GPU is recommended. + + References + ---------- + - `MiniCPM: Unveiling the Potential of End-side Large Language Models + ` + - https://github.com/OpenBMB/MiniCPM + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> documentAssembler = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("documents") + >>> cpm = CPMTransformer.pretrained("llama_2_7b_chat_hf_int4") \\ + ... .setInputCols(["documents"]) \\ + ... .setMaxOutputLength(50) \\ + ... .setOutputCol("generation") + >>> pipeline = Pipeline().setStages([documentAssembler, cpm]) + >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.select("summaries.generation").show(truncate=False) + +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |result | + +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |[My name is Leonardo. I am a student at the University of California, Los Angeles. I have a passion for writing and learning about different cultures. I enjoy playing basketball and watching movies]| + -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + """ + + name = "CPMTransformer" + + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + + outputAnnotatorType = AnnotatorType.DOCUMENT + + configProtoBytes = Param(Params._dummy(), "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. 
Get with config_proto.SerializeToString()", + TypeConverters.toListInt) + + minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated", + typeConverter=TypeConverters.toInt) + + maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text", + typeConverter=TypeConverters.toInt) + + doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise", + typeConverter=TypeConverters.toBoolean) + + temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities", + typeConverter=TypeConverters.toFloat) + + topK = Param(Params._dummy(), "topK", + "The number of highest probability vocabulary tokens to keep for top-k-filtering", + typeConverter=TypeConverters.toInt) + + topP = Param(Params._dummy(), "topP", + "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation", + typeConverter=TypeConverters.toFloat) + + repetitionPenalty = Param(Params._dummy(), "repetitionPenalty", + "The parameter for repetition penalty. 1.0 means no penalty. See `this paper `__ for more details", + typeConverter=TypeConverters.toFloat) + + noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize", + "If set to int > 0, all ngrams of that size can only occur once", + typeConverter=TypeConverters.toInt) + + ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds", + "A list of token ids which are ignored in the decoder's output", + typeConverter=TypeConverters.toListInt) + + def setIgnoreTokenIds(self, value): + """A list of token ids which are ignored in the decoder's output. + + Parameters + ---------- + value : List[int] + The words to be filtered out + """ + return self._set(ignoreTokenIds=value) + + def setConfigProtoBytes(self, b): + """Sets configProto from tensorflow, serialized into byte array. + + Parameters + ---------- + b : List[int] + ConfigProto from tensorflow, serialized into byte array + """ + return self._set(configProtoBytes=b) + + def setMinOutputLength(self, value): + """Sets minimum length of the sequence to be generated. + + Parameters + ---------- + value : int + Minimum length of the sequence to be generated + """ + return self._set(minOutputLength=value) + + def setMaxOutputLength(self, value): + """Sets maximum length of output text. + + Parameters + ---------- + value : int + Maximum length of output text + """ + return self._set(maxOutputLength=value) + + def setDoSample(self, value): + """Sets whether or not to use sampling, use greedy decoding otherwise. + + Parameters + ---------- + value : bool + Whether or not to use sampling; use greedy decoding otherwise + """ + return self._set(doSample=value) + + def setTemperature(self, value): + """Sets the value used to module the next token probabilities. + + Parameters + ---------- + value : float + The value used to module the next token probabilities + """ + return self._set(temperature=value) + + def setTopK(self, value): + """Sets the number of highest probability vocabulary tokens to keep for + top-k-filtering. + + Parameters + ---------- + value : int + Number of highest probability vocabulary tokens to keep + """ + return self._set(topK=value) + + def setTopP(self, value): + """Sets the top cumulative probability for vocabulary tokens. + + If set to float < 1, only the most probable tokens with probabilities + that add up to ``topP`` or higher are kept for generation. 
+
+        Parameters
+        ----------
+        value : float
+            Cumulative probability for vocabulary tokens
+        """
+        return self._set(topP=value)
+
+    def setRepetitionPenalty(self, value):
+        """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+        Parameters
+        ----------
+        value : float
+            The repetition penalty
+
+        References
+        ----------
+        See `Ctrl: A Conditional Transformer Language Model For Controllable
+        Generation <https://arxiv.org/abs/1909.05858>`__ for more details.
+        """
+        return self._set(repetitionPenalty=value)
+
+    def setNoRepeatNgramSize(self, value):
+        """Sets size of n-grams that can only occur once.
+
+        If set to int > 0, all ngrams of that size can only occur once.
+
+        Parameters
+        ----------
+        value : int
+            N-gram size that can only occur once
+        """
+        return self._set(noRepeatNgramSize=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.CPMTransformer", java_model=None):
+        super(CPMTransformer, self).__init__(classname=classname, java_model=java_model)
+        self._setDefault(minOutputLength=0, maxOutputLength=50, doSample=False, temperature=0.8, topK=100, topP=0.8,
+                         repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        CPMTransformer
+            The restored model
+        """
+        from sparknlp.internal import _CPMLoader
+        jModel = _CPMLoader(folder, spark_session._jsparkSession)._java_obj
+        return CPMTransformer(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="llama_2_7b_chat_hf_int4", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "llama_2_7b_chat_hf_int4"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
+
+        Returns
+        -------
+        CPMTransformer
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(CPMTransformer, name, lang, remote_loc)
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
index c1aabeeb36aec0..af29c549735a91 100644
--- a/python/sparknlp/internal/__init__.py
+++ b/python/sparknlp/internal/__init__.py
@@ -110,6 +110,11 @@ def __init__(self, path, jspark):
             path, jspark)
 
 
+class _CPMLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark):
+        super(_CPMLoader, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.seq2seq.CPMTransformer.loadSavedModel", path, jspark)
+
 
 class _DistilBertLoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark):
diff --git a/python/test/annotator/seq2seq/cpm_transformer_test.py b/python/test/annotator/seq2seq/cpm_transformer_test.py
new file mode 100644
index 00000000000000..810505cb82a124
--- /dev/null
+++ b/python/test/annotator/seq2seq/cpm_transformer_test.py
@@ -0,0 +1,46 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from test.util import SparkContextForTest
+
+
+@pytest.mark.slow
+class CPMTransformerTextGenerationTestSpec(unittest.TestCase):
+    def setUp(self):
+        self.spark = SparkContextForTest.spark
+
+    def runTest(self):
+        data = self.spark.createDataFrame([
+            [1, """Leonardo Da Vinci invented the microscope?""".strip().replace("\n", " ")]]).toDF("id", "text")
+
+        document_assembler = DocumentAssembler() \
+            .setInputCol("text") \
+            .setOutputCol("documents")
+
+        cpm = CPMTransformer \
+            .pretrained() \
+            .setMaxOutputLength(50) \
+            .setDoSample(False) \
+            .setInputCols(["documents"]) \
+            .setOutputCol("generation")
+
+        pipeline = Pipeline().setStages([document_assembler, cpm])
+        results = pipeline.fit(data).transform(data)
+
+        results.select("generation.result").show(truncate=False)
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala
index ff6ef4d6d23a5e..ab039cf00f48d4 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/CPMTransformer.scala
@@ -77,17 +77,6 @@ import org.json4s.jackson.JsonMethods._
  * - [[https://shengdinghu.notion.site/MiniCPM-Unveiling-the-Potential-of-End-side-Large-Language-Models-d4d3a8c426424654a4e80e42a711cb20 MiniCPM: Unveiling the Potential of End-side Large Language Models]]
  * - [[https://github.com/OpenBMB/MiniCPM]]
  *
- * '''Paper Abstract:'''
- *
- * ''In this work, we develop and release Llama 2, a collection of pretrained and fine-tuned
- * large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our
- * fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models
- * outperform open-source chat models on most benchmarks we tested, and based on our human
- * evaluations for helpfulness and safety, may be a suitable substitute for closed-source models.
- * We provide a detailed description of our approach to fine-tuning and safety improvements of
- * Llama 2-Chat in order to enable the community to build on our work and contribute to the
- * responsible development of LLMs.''
- *
  * '''Note:'''
  *
  * This is a very computationally expensive module especially on larger sequence. The use of an
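---
A minimal end-to-end sketch of how the pieces of this patch fit together, assuming the patch is applied, Spark NLP is on the classpath, and the default pretrained model "llama_2_7b_chat_hf_int4" is available from the Models Hub; the generation settings restate the annotator's defaults and the input sentence is illustrative only:

# Exercises the new CPMTransformer Python API end to end.
import sparknlp
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import CPMTransformer
from pyspark.ml import Pipeline

# Start a SparkSession with Spark NLP loaded.
spark = sparknlp.start()

document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("documents")

# pretrained() with no arguments resolves to "llama_2_7b_chat_hf_int4".
# For a locally exported model, CPMTransformer.loadSavedModel(folder, spark)
# is the alternative entry point, backed by the new _CPMLoader wrapper.
cpm = CPMTransformer.pretrained() \
    .setInputCols(["documents"]) \
    .setMaxOutputLength(50) \
    .setDoSample(False) \
    .setOutputCol("generation")

pipeline = Pipeline().setStages([document_assembler, cpm])

data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
pipeline.fit(data).transform(data) \
    .select("generation.result") \
    .show(truncate=False)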