diff --git a/build.sbt b/build.sbt index cdbe01697bd207..4f35f22f8ae570 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "5.4.2" +version := "5.5.0" (ThisBuild / scalaVersion) := scalaVer @@ -180,6 +180,16 @@ val onnxDependencies: Seq[sbt.ModuleID] = else Seq(onnxCPU) +val llamaCppDependencies = + if (is_gpu.equals("true")) + Seq(llamaCppGPU) + else if (is_silicon.equals("true")) + Seq(llamaCppSilicon) +// else if (is_aarch64.equals("true")) +// Seq(openVinoCPU) + else + Seq(llamaCppCPU) + val openVinoDependencies: Seq[sbt.ModuleID] = if (is_gpu.equals("true")) Seq(openVinoGPU) @@ -202,6 +212,7 @@ lazy val root = (project in file(".")) utilDependencies ++ tensorflowDependencies ++ onnxDependencies ++ + llamaCppDependencies ++ openVinoDependencies ++ typedDependencyParserDependencies, // TODO potentially improve this? diff --git a/docs/en/annotator_entries/AutoGGUF.md b/docs/en/annotator_entries/AutoGGUF.md new file mode 100644 index 00000000000000..4bf8384004b0e0 --- /dev/null +++ b/docs/en/annotator_entries/AutoGGUF.md @@ -0,0 +1,135 @@ +{%- capture title -%} +AutoGGUFModel +{%- endcapture -%} + +{%- capture description -%} +Annotator that uses the llama.cpp library to generate text completions with large language +models. + +For settable parameters, and their explanations, see [HasLlamaCppProperties](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala) and refer to +the llama.cpp documentation of +[server.cpp](https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server) +for more information. + +If the parameters are not set, the annotator will default to use the parameters provided by +the model. + +Pretrained models can be loaded with `pretrained` of the companion object: + +```scala +val autoGGUFModel = AutoGGUFModel.pretrained() + .setInputCols("document") + .setOutputCol("completions") +``` + +The default model is `"gguf-phi3-mini-4k-instruct-q4"`, if no name is provided. + +For available pretrained models please see the [Models Hub](https://sparknlp.org/models). + +For extended examples of usage, see the +[AutoGGUFModelTest](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala) +and the +[example notebook](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb). + +**Note**: To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set +the number of GPU layers with the `setNGpuLayers` method. + +When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` +according to your hardware to avoid out-of-memory errors. +{%- endcapture -%} + +{%- capture input_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture output_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture python_example -%} +>>> import sparknlp +>>> from sparknlp.base import * +>>> from sparknlp.annotator import * +>>> from pyspark.ml import Pipeline +>>> document = DocumentAssembler() \ +... .setInputCol("text") \ +... .setOutputCol("document") +>>> autoGGUFModel = AutoGGUFModel.pretrained() \ +... .setInputCols(["document"]) \ +... .setOutputCol("completions") \ +... .setBatchSize(4) \ +... .setNPredict(20) \ +... .setNGpuLayers(99) \ +... .setTemperature(0.4) \ +... .setTopK(40) \ +... 
.setTopP(0.9) \ +... .setPenalizeNl(True) +>>> pipeline = Pipeline().setStages([document, autoGGUFModel]) +>>> data = spark.createDataFrame([["Hello, I am a"]]).toDF("text") +>>> result = pipeline.fit(data).transform(data) +>>> result.select("completions").show(truncate = False) ++-----------------------------------------------------------------------------------------------------------------------------------+ +|completions | ++-----------------------------------------------------------------------------------------------------------------------------------+ +|[{document, 0, 78, new user. I am currently working on a project and I need to create a list of , {prompt -> Hello, I am a}, []}]| ++-----------------------------------------------------------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture scala_example -%} +import com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline +import spark.implicits._ + +val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val autoGGUFModel = AutoGGUFModel + .pretrained() + .setInputCols("document") + .setOutputCol("completions") + .setBatchSize(4) + .setNPredict(20) + .setNGpuLayers(99) + .setTemperature(0.4f) + .setTopK(40) + .setTopP(0.9f) + .setPenalizeNl(true) + +val pipeline = new Pipeline().setStages(Array(document, autoGGUFModel)) + +val data = Seq("Hello, I am a").toDF("text") +val result = pipeline.fit(data).transform(data) +result.select("completions").show(truncate = false) ++-----------------------------------------------------------------------------------------------------------------------------------+ +|completions | ++-----------------------------------------------------------------------------------------------------------------------------------+ +|[{document, 0, 78, new user. 
I am currently working on a project and I need to create a list of , {prompt -> Hello, I am a}, []}]| ++-----------------------------------------------------------------------------------------------------------------------------------+ + +{%- endcapture -%} + +{%- capture api_link -%} +[AutoGGUFModel](/api/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel) +{%- endcapture -%} + +{%- capture python_api_link -%} +[AutoGGUFModel](/api/python/reference/autosummary/sparknlp/annotator/seq2seq/auto_gguf_model/index.html) +{%- endcapture -%} + +{%- capture source_link -%} +[AutoGGUFModel](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala) +{%- endcapture -%} + +{% include templates/anno_template.md +title=title +description=description +input_anno=input_anno +output_anno=output_anno +python_example=python_example +scala_example=scala_example +api_link=api_link +python_api_link=python_api_link +source_link=source_link +%} \ No newline at end of file diff --git a/docs/en/annotators.md b/docs/en/annotators.md index b65eae52cc7f12..161bd8f8e3f496 100644 --- a/docs/en/annotators.md +++ b/docs/en/annotators.md @@ -45,6 +45,7 @@ There are two types of Annotators: {:.table-model-big} |Annotator|Description|Version | |---|---|---| +{% include templates/anno_table_entry.md path="" name="AutoGGUFModel" summary="Annotator that uses the llama.cpp library to generate text completions with large language models."%} {% include templates/anno_table_entry.md path="" name="BGEEmbeddings" summary="Sentence embeddings using BGE."%} {% include templates/anno_table_entry.md path="" name="BigTextMatcher" summary="Annotator to match exact phrases (by token) provided in a file against a Document."%} {% include templates/anno_table_entry.md path="" name="Chunk2Doc" summary="Converts a `CHUNK` type column back into `DOCUMENT`. Useful when trying to re-tokenize or do further analysis on a `CHUNK` result."%} diff --git a/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb new file mode 100644 index 00000000000000..f07f3892e2d38f --- /dev/null +++ b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb @@ -0,0 +1,628 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb)\n", + "\n", + "# Import llama.cpp 🦙 models into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- llama.cpp support was introduced in `Spark NLP 5.5.0`, enabling quantized LLM inference on a wide range of devices. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You need to use your own `.gguf` model files, which also include the models from the [Hugging Face Models](https://huggingface.co/models?library=gguf)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download a GGUF Model\n", + "\n", + "Lets download a GGUF model to test it out. For this, we will use [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf). It is a 3.8B parameter model which also is available in 4-bit quantization. 
\n", + "\n", + "We can download the model by selecting the q4 GGUF file from the \"Files and versions\" tab.\n", + "\n", + "Once downloaded, we can directly import this model into Spark NLP!" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-07-20 11:11:30-- https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf?download=true\n", + "Resolving huggingface.co (huggingface.co)... 2600:9000:275f:7600:17:b174:6d00:93a1, 2600:9000:275f:3800:17:b174:6d00:93a1, 2600:9000:275f:6e00:17:b174:6d00:93a1, ...\n", + "Connecting to huggingface.co (huggingface.co)|2600:9000:275f:7600:17:b174:6d00:93a1|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/41/c8/41c860f65b01de5dc4c68b00d84cead799d3e7c48e38ee749f4c6057776e2e9e/8a83c7fb9049a9b2e92266fa7ad04933bb53aa1e85136b7b30f1b8000ff2edef?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27Phi-3-mini-4k-instruct-q4.gguf%3B+filename%3D%22Phi-3-mini-4k-instruct-q4.gguf%22%3B&Expires=1721725890&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMTcyNTg5MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzQxL2M4LzQxYzg2MGY2NWIwMWRlNWRjNGM2OGIwMGQ4NGNlYWQ3OTlkM2U3YzQ4ZTM4ZWU3NDlmNGM2MDU3Nzc2ZTJlOWUvOGE4M2M3ZmI5MDQ5YTliMmU5MjI2NmZhN2FkMDQ5MzNiYjUzYWExZTg1MTM2YjdiMzBmMWI4MDAwZmYyZWRlZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=joXQf4QRpEhtFeQ3r3gJ0zyJ3bXReb9OxM%7EZit3GJ3355ycKQzemJ%7E6eD-J7%7EkphnsPpRpUDhQkCr2-Oidqo7dgltmFsWAX4SmQLn65R1yjO%7EsMvi%7E4vOUpaRPYlSMCyWWJpiZZjQYVH4Uk0o-G62ALFXKGaDfr627kvahP-fJYwNNP1riTrH8hbbah28ZKRAQjUGI1aNqerG0jojudnGOagawISAnudkAOFZfxnN7Qw3CoMySZLj9Euu02RBv2A5Yy0uSjG7b8rilx-tU5HDR3ECohdQQ8yPXjYFU-LZi-zcG1wwBDF-S01qb%7EgPWsTorenxfRM2cG6J%7EvSziGCzA__&Key-Pair-Id=K24J24Z295AEI9 [following]\n", + "--2024-07-20 11:11:30-- https://cdn-lfs-us-1.huggingface.co/repos/41/c8/41c860f65b01de5dc4c68b00d84cead799d3e7c48e38ee749f4c6057776e2e9e/8a83c7fb9049a9b2e92266fa7ad04933bb53aa1e85136b7b30f1b8000ff2edef?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27Phi-3-mini-4k-instruct-q4.gguf%3B+filename%3D%22Phi-3-mini-4k-instruct-q4.gguf%22%3B&Expires=1721725890&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMTcyNTg5MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzQxL2M4LzQxYzg2MGY2NWIwMWRlNWRjNGM2OGIwMGQ4NGNlYWQ3OTlkM2U3YzQ4ZTM4ZWU3NDlmNGM2MDU3Nzc2ZTJlOWUvOGE4M2M3ZmI5MDQ5YTliMmU5MjI2NmZhN2FkMDQ5MzNiYjUzYWExZTg1MTM2YjdiMzBmMWI4MDAwZmYyZWRlZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=joXQf4QRpEhtFeQ3r3gJ0zyJ3bXReb9OxM%7EZit3GJ3355ycKQzemJ%7E6eD-J7%7EkphnsPpRpUDhQkCr2-Oidqo7dgltmFsWAX4SmQLn65R1yjO%7EsMvi%7E4vOUpaRPYlSMCyWWJpiZZjQYVH4Uk0o-G62ALFXKGaDfr627kvahP-fJYwNNP1riTrH8hbbah28ZKRAQjUGI1aNqerG0jojudnGOagawISAnudkAOFZfxnN7Qw3CoMySZLj9Euu02RBv2A5Yy0uSjG7b8rilx-tU5HDR3ECohdQQ8yPXjYFU-LZi-zcG1wwBDF-S01qb%7EgPWsTorenxfRM2cG6J%7EvSziGCzA__&Key-Pair-Id=K24J24Z295AEI9\n", + "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 2600:9000:275f:7a00:17:9a40:4dc0:93a1, 2600:9000:275f:fc00:17:9a40:4dc0:93a1, 2600:9000:275f:4800:17:9a40:4dc0:93a1, ...\n", + "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|2600:9000:275f:7a00:17:9a40:4dc0:93a1|:443... 
connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 2393231072 (2.2G) [binary/octet-stream]\n", + "Saving to: ‘Phi-3-mini-4k-instruct-q4.gguf?download=true’\n", + "\n", + "Phi-3-mini-4k-instr 100%[===================>] 2.23G 22.5MB/s in 96s \n", + "\n", + "2024-07-20 11:13:06 (23.7 MB/s) - ‘Phi-3-mini-4k-instruct-q4.gguf?download=true’ saved [2393231072/2393231072]\n", + "\n" + ] + } + ], + "source": [ + "EXPORT_PATH = \"Phi-3-mini-4k-instruct-q4.gguf\"\n", + "! wget \"https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf?download=true\" -O {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save AutGGUF models in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP (if running it Google Colab)\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only execute this if you are on Google Colab\n", + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24/07/21 10:51:01 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.34 instead (on interface enp3s0)\n", + "24/07/21 10:51:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/ducha/mambaforge/envs/sparknlp_dev/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /home/ducha/.ivy2/cache\n", + "The jars for the packages stored in: /home/ducha/.ivy2/jars\n", + "com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-994cb793-bb56-4b46-ad2f-b20d68529970;1.0\n", + "\tconfs: [default]\n", + "\tfound com.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 in central\n", + "\tfound com.typesafe#config;1.4.2 in central\n", + "\tfound org.rocksdb#rocksdbjni;6.29.5 in central\n", + "\tfound com.amazonaws#aws-java-sdk-s3;1.12.500 in central\n", + "\tfound com.amazonaws#aws-java-sdk-kms;1.12.500 in central\n", + "\tfound com.amazonaws#aws-java-sdk-core;1.12.500 in central\n", + "\tfound commons-logging#commons-logging;1.1.3 in central\n", + "\tfound commons-codec#commons-codec;1.15 in central\n", + "\tfound org.apache.httpcomponents#httpclient;4.5.13 in central\n", + "\tfound org.apache.httpcomponents#httpcore;4.4.13 in central\n", + "\tfound software.amazon.ion#ion-java;1.0.2 in central\n", + "\tfound joda-time#joda-time;2.8.1 in central\n", + "\tfound com.amazonaws#jmespath-java;1.12.500 in central\n", + "\tfound com.github.universal-automata#liblevenshtein;3.0.0 in central\n", + "\tfound com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central\n", + "\tfound com.google.protobuf#protobuf-java;3.0.0-beta-3 in central\n", + "\tfound com.google.code.gson#gson;2.3 in central\n", + "\tfound it.unimi.dsi#fastutil;7.0.12 in central\n", + "\tfound org.projectlombok#lombok;1.16.8 in central\n", 
+ "\tfound com.google.cloud#google-cloud-storage;2.20.1 in central\n", + "\tfound com.google.guava#guava;31.1-jre in central\n", + "\tfound com.google.guava#failureaccess;1.0.1 in central\n", + "\tfound com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central\n", + "\tfound com.google.errorprone#error_prone_annotations;2.18.0 in central\n", + "\tfound com.google.j2objc#j2objc-annotations;1.3 in central\n", + "\tfound com.google.http-client#google-http-client;1.43.0 in central\n", + "\tfound io.opencensus#opencensus-contrib-http-util;0.31.1 in central\n", + "\tfound com.google.http-client#google-http-client-jackson2;1.43.0 in central\n", + "\tfound com.google.http-client#google-http-client-gson;1.43.0 in central\n", + "\tfound com.google.api-client#google-api-client;2.2.0 in central\n", + "\tfound com.google.oauth-client#google-oauth-client;1.34.1 in central\n", + "\tfound com.google.http-client#google-http-client-apache-v2;1.43.0 in central\n", + "\tfound com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central\n", + "\tfound com.google.code.gson#gson;2.10.1 in central\n", + "\tfound com.google.cloud#google-cloud-core;2.12.0 in central\n", + "\tfound io.grpc#grpc-context;1.53.0 in central\n", + "\tfound com.google.auto.value#auto-value-annotations;1.10.1 in central\n", + "\tfound com.google.auto.value#auto-value;1.10.1 in central\n", + "\tfound javax.annotation#javax.annotation-api;1.3.2 in central\n", + "\tfound com.google.cloud#google-cloud-core-http;2.12.0 in central\n", + "\tfound com.google.http-client#google-http-client-appengine;1.43.0 in central\n", + "\tfound com.google.api#gax-httpjson;0.108.2 in central\n", + "\tfound com.google.cloud#google-cloud-core-grpc;2.12.0 in central\n", + "\tfound io.grpc#grpc-alts;1.53.0 in central\n", + "\tfound io.grpc#grpc-grpclb;1.53.0 in central\n", + "\tfound org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central\n", + "\tfound io.grpc#grpc-auth;1.53.0 in central\n", + "\tfound io.grpc#grpc-protobuf;1.53.0 in central\n", + "\tfound io.grpc#grpc-protobuf-lite;1.53.0 in central\n", + "\tfound io.grpc#grpc-core;1.53.0 in central\n", + "\tfound com.google.api#gax;2.23.2 in central\n", + "\tfound com.google.api#gax-grpc;2.23.2 in central\n", + "\tfound com.google.auth#google-auth-library-credentials;1.16.0 in central\n", + "\tfound com.google.auth#google-auth-library-oauth2-http;1.16.0 in central\n", + "\tfound com.google.api#api-common;2.6.2 in central\n", + "\tfound io.opencensus#opencensus-api;0.31.1 in central\n", + "\tfound com.google.api.grpc#proto-google-iam-v1;1.9.2 in central\n", + "\tfound com.google.protobuf#protobuf-java;3.21.12 in central\n", + "\tfound com.google.protobuf#protobuf-java-util;3.21.12 in central\n", + "\tfound com.google.api.grpc#proto-google-common-protos;2.14.2 in central\n", + "\tfound org.threeten#threetenbp;1.6.5 in central\n", + "\tfound com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central\n", + "\tfound com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central\n", + "\tfound com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central\n", + "\tfound com.google.code.findbugs#jsr305;3.0.2 in central\n", + "\tfound io.grpc#grpc-api;1.53.0 in central\n", + "\tfound io.grpc#grpc-stub;1.53.0 in central\n", + "\tfound org.checkerframework#checker-qual;3.31.0 in central\n", + "\tfound io.perfmark#perfmark-api;0.26.0 in central\n", + "\tfound com.google.android#annotations;4.1.1.4 in central\n", + "\tfound 
org.codehaus.mojo#animal-sniffer-annotations;1.22 in central\n", + "\tfound io.opencensus#opencensus-proto;0.2.0 in central\n", + "\tfound io.grpc#grpc-services;1.53.0 in central\n", + "\tfound com.google.re2j#re2j;1.6 in central\n", + "\tfound io.grpc#grpc-netty-shaded;1.53.0 in central\n", + "\tfound io.grpc#grpc-googleapis;1.53.0 in central\n", + "\tfound io.grpc#grpc-xds;1.53.0 in central\n", + "\tfound com.navigamez#greex;1.0 in central\n", + "\tfound dk.brics.automaton#automaton;1.11-8 in central\n", + "\tfound com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central\n", + "\tfound com.microsoft.onnxruntime#onnxruntime;1.17.0 in central\n", + ":: resolution report :: resolve 843ms :: artifacts dl 40ms\n", + "\t:: modules in use:\n", + "\tcom.amazonaws#aws-java-sdk-core;1.12.500 from central in [default]\n", + "\tcom.amazonaws#aws-java-sdk-kms;1.12.500 from central in [default]\n", + "\tcom.amazonaws#aws-java-sdk-s3;1.12.500 from central in [default]\n", + "\tcom.amazonaws#jmespath-java;1.12.500 from central in [default]\n", + "\tcom.github.universal-automata#liblevenshtein;3.0.0 from central in [default]\n", + "\tcom.google.android#annotations;4.1.1.4 from central in [default]\n", + "\tcom.google.api#api-common;2.6.2 from central in [default]\n", + "\tcom.google.api#gax;2.23.2 from central in [default]\n", + "\tcom.google.api#gax-grpc;2.23.2 from central in [default]\n", + "\tcom.google.api#gax-httpjson;0.108.2 from central in [default]\n", + "\tcom.google.api-client#google-api-client;2.2.0 from central in [default]\n", + "\tcom.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", + "\tcom.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", + "\tcom.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", + "\tcom.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default]\n", + "\tcom.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default]\n", + "\tcom.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default]\n", + "\tcom.google.auth#google-auth-library-credentials;1.16.0 from central in [default]\n", + "\tcom.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default]\n", + "\tcom.google.auto.value#auto-value;1.10.1 from central in [default]\n", + "\tcom.google.auto.value#auto-value-annotations;1.10.1 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core;2.12.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core-http;2.12.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-storage;2.20.1 from central in [default]\n", + "\tcom.google.code.findbugs#jsr305;3.0.2 from central in [default]\n", + "\tcom.google.code.gson#gson;2.10.1 from central in [default]\n", + "\tcom.google.errorprone#error_prone_annotations;2.18.0 from central in [default]\n", + "\tcom.google.guava#failureaccess;1.0.1 from central in [default]\n", + "\tcom.google.guava#guava;31.1-jre from central in [default]\n", + "\tcom.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default]\n", + "\tcom.google.http-client#google-http-client;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-appengine;1.43.0 from central in [default]\n", + 
"\tcom.google.http-client#google-http-client-gson;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-jackson2;1.43.0 from central in [default]\n", + "\tcom.google.j2objc#j2objc-annotations;1.3 from central in [default]\n", + "\tcom.google.oauth-client#google-oauth-client;1.34.1 from central in [default]\n", + "\tcom.google.protobuf#protobuf-java;3.21.12 from central in [default]\n", + "\tcom.google.protobuf#protobuf-java-util;3.21.12 from central in [default]\n", + "\tcom.google.re2j#re2j;1.6 from central in [default]\n", + "\tcom.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 from central in [default]\n", + "\tcom.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default]\n", + "\tcom.microsoft.onnxruntime#onnxruntime;1.17.0 from central in [default]\n", + "\tcom.navigamez#greex;1.0 from central in [default]\n", + "\tcom.typesafe#config;1.4.2 from central in [default]\n", + "\tcommons-codec#commons-codec;1.15 from central in [default]\n", + "\tcommons-logging#commons-logging;1.1.3 from central in [default]\n", + "\tdk.brics.automaton#automaton;1.11-8 from central in [default]\n", + "\tio.grpc#grpc-alts;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-api;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-auth;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-context;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-core;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-googleapis;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-grpclb;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-netty-shaded;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-protobuf;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-protobuf-lite;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-services;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-stub;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-xds;1.53.0 from central in [default]\n", + "\tio.opencensus#opencensus-api;0.31.1 from central in [default]\n", + "\tio.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default]\n", + "\tio.opencensus#opencensus-proto;0.2.0 from central in [default]\n", + "\tio.perfmark#perfmark-api;0.26.0 from central in [default]\n", + "\tit.unimi.dsi#fastutil;7.0.12 from central in [default]\n", + "\tjavax.annotation#javax.annotation-api;1.3.2 from central in [default]\n", + "\tjoda-time#joda-time;2.8.1 from central in [default]\n", + "\torg.apache.httpcomponents#httpclient;4.5.13 from central in [default]\n", + "\torg.apache.httpcomponents#httpcore;4.4.13 from central in [default]\n", + "\torg.checkerframework#checker-qual;3.31.0 from central in [default]\n", + "\torg.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default]\n", + "\torg.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default]\n", + "\torg.projectlombok#lombok;1.16.8 from central in [default]\n", + "\torg.rocksdb#rocksdbjni;6.29.5 from central in [default]\n", + "\torg.threeten#threetenbp;1.6.5 from central in [default]\n", + "\tsoftware.amazon.ion#ion-java;1.0.2 from central in [default]\n", + "\t:: evicted modules:\n", + "\tcommons-logging#commons-logging;1.2 by [commons-logging#commons-logging;1.1.3] in [default]\n", + "\tcommons-codec#commons-codec;1.11 by [commons-codec#commons-codec;1.15] in [default]\n", + "\tcom.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default]\n", + "\tcom.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] 
in [default]\n", + "\tcom.google.code.gson#gson;2.3 by [com.google.code.gson#gson;2.10.1] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 83 | 0 | 0 | 5 || 78 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-994cb793-bb56-4b46-ad2f-b20d68529970\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 78 already retrieved (0kB/22ms)\n", + "24/07/21 10:51:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP with GPU enabled. If you don't have GPUs available remove this parameter.\n", + "spark = sparknlp.start(gpu=True)\n", + "print(sparknlp.version())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use the `loadSavedModel` functon in `AutoGGUFModel`\n", + "- Most params will be set automatically. They can also be set later after loading the model in `AutoGGUFModel` during runtime, so don't worry about setting them now.\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted 'libllama.so' to '/tmp/libllama.so'\n", + "Extracted 'libjllama.so' to '/tmp/libjllama.so'\n" + ] + } + ], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "autoGGUFModel = (\n", + " AutoGGUFModel.loadSavedModel(EXPORT_PATH, spark)\n", + " .setInputCols(\"document\")\n", + " .setOutputCol(\"completions\")\n", + " .setBatchSize(4)\n", + " .setNPredict(20)\n", + " .setNGpuLayers(99)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "autoGGUFModel.write().overwrite().save(f\"Phi-3-mini-4k-instruct-q4_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your GGUF model from loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 2337168\n", + "drwxr-xr-x 2 ducha ducha 4096 Jul 21 16:24 metadata\n", + "-rwxrwxr-x 1 ducha ducha 2393231072 Jul 21 16:24 Phi-3-mini-4k-instruct-q4.gguf\n" + ] + } + ], + "source": [ + "! ls -l Phi-3-mini-4k-instruct-q4_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny GGUF model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] build info build=3008 commit=\"1d8fca72\"\n", + "[INFO] system info n_threads=6 n_threads_batch=-1 total_threads=6 system_info=\"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \"\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_loader: loaded meta data with 24 key-value pairs and 195 tensors from /tmp/spark-bbad4f64-91a7-4b6e-8242-7f91e6abca54/userFiles-f7d4e4e9-c02d-46e4-81b5-bf5a26d70930/Phi-3-mini-4k-instruct-q4.gguf (version GGUF V3 (latest))\n", + "llama_model_loader: Dumping metadata keys/values. 
Note: KV overrides do not apply in this output.\n", + "llama_model_loader: - kv 0: general.architecture str = phi3\n", + "llama_model_loader: - kv 1: general.name str = Phi3\n", + "llama_model_loader: - kv 2: phi3.context_length u32 = 4096\n", + "llama_model_loader: - kv 3: phi3.embedding_length u32 = 3072\n", + "llama_model_loader: - kv 4: phi3.feed_forward_length u32 = 8192\n", + "llama_model_loader: - kv 5: phi3.block_count u32 = 32\n", + "llama_model_loader: - kv 6: phi3.attention.head_count u32 = 32\n", + "llama_model_loader: - kv 7: phi3.attention.head_count_kv u32 = 32\n", + "llama_model_loader: - kv 8: phi3.attention.layer_norm_rms_epsilon f32 = 0.000010\n", + "llama_model_loader: - kv 9: phi3.rope.dimension_count u32 = 96\n", + "llama_model_loader: - kv 10: general.file_type u32 = 15\n", + "llama_model_loader: - kv 11: tokenizer.ggml.model str = llama\n", + "llama_model_loader: - kv 12: tokenizer.ggml.pre str = default\n", + "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32064] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n", + "llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32064] = [0.000000, 0.000000, 0.000000, 0.0000...\n", + "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32064] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n", + "llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n", + "llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000\n", + "llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n", + "llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 32000\n", + "llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n", + "llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n", + "llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...\n", + "llama_model_loader: - kv 23: general.quantization_version u32 = 2\n", + "llama_model_loader: - type f32: 65 tensors\n", + "llama_model_loader: - type q4_K: 81 tensors\n", + "llama_model_loader: - type q5_K: 32 tensors\n", + "llama_model_loader: - type q6_K: 17 tensors\n", + "llm_load_vocab: special tokens definition check successful ( 323/32064 ).\n", + "llm_load_print_meta: format = GGUF V3 (latest)\n", + "llm_load_print_meta: arch = phi3\n", + "llm_load_print_meta: vocab type = SPM\n", + "llm_load_print_meta: n_vocab = 32064\n", + "llm_load_print_meta: n_merges = 0\n", + "llm_load_print_meta: n_ctx_train = 4096\n", + "llm_load_print_meta: n_embd = 3072\n", + "llm_load_print_meta: n_head = 32\n", + "llm_load_print_meta: n_head_kv = 32\n", + "llm_load_print_meta: n_layer = 32\n", + "llm_load_print_meta: n_rot = 96\n", + "llm_load_print_meta: n_embd_head_k = 96\n", + "llm_load_print_meta: n_embd_head_v = 96\n", + "llm_load_print_meta: n_gqa = 1\n", + "llm_load_print_meta: n_embd_k_gqa = 3072\n", + "llm_load_print_meta: n_embd_v_gqa = 3072\n", + "llm_load_print_meta: f_norm_eps = 0.0e+00\n", + "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", + "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", + "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", + "llm_load_print_meta: f_logit_scale = 0.0e+00\n", + "llm_load_print_meta: n_ff = 8192\n", + "llm_load_print_meta: n_expert = 0\n", + "llm_load_print_meta: n_expert_used = 0\n", + "llm_load_print_meta: causal attn = 1\n", + "llm_load_print_meta: pooling type = 0\n", + "llm_load_print_meta: rope type = 2\n", + "llm_load_print_meta: rope scaling = linear\n", + "llm_load_print_meta: freq_base_train = 10000.0\n", + 
"llm_load_print_meta: freq_scale_train = 1\n", + "llm_load_print_meta: n_yarn_orig_ctx = 4096\n", + "llm_load_print_meta: rope_finetuned = unknown\n", + "llm_load_print_meta: ssm_d_conv = 0\n", + "llm_load_print_meta: ssm_d_inner = 0\n", + "llm_load_print_meta: ssm_d_state = 0\n", + "llm_load_print_meta: ssm_dt_rank = 0\n", + "llm_load_print_meta: model type = 3B\n", + "llm_load_print_meta: model ftype = Q4_K - Medium\n", + "llm_load_print_meta: model params = 3.82 B\n", + "llm_load_print_meta: model size = 2.23 GiB (5.01 BPW) \n", + "llm_load_print_meta: general.name = Phi3\n", + "llm_load_print_meta: BOS token = 1 ''\n", + "llm_load_print_meta: EOS token = 32000 '<|endoftext|>'\n", + "llm_load_print_meta: UNK token = 0 ''\n", + "llm_load_print_meta: PAD token = 32000 '<|endoftext|>'\n", + "llm_load_print_meta: LF token = 13 '<0x0A>'\n", + "llm_load_print_meta: EOT token = 32007 '<|end|>'\n", + "ggml_cuda_init: failed to initialize CUDA: unknown error\n", + "llm_load_tensors: ggml ctx size = 0.11 MiB\n", + "llm_load_tensors: offloading 32 repeating layers to GPU\n", + "llm_load_tensors: offloading non-repeating layers to GPU\n", + "llm_load_tensors: offloaded 33/33 layers to GPU\n", + "llm_load_tensors: CPU buffer size = 2281.66 MiB\n", + "...........................................................................................\n", + "llama_new_context_with_model: n_ctx = 512\n", + "llama_new_context_with_model: n_batch = 512\n", + "llama_new_context_with_model: n_ubatch = 512\n", + "llama_new_context_with_model: flash_attn = 0\n", + "llama_new_context_with_model: freq_base = 10000.0\n", + "llama_new_context_with_model: freq_scale = 1\n", + "ggml_cuda_host_malloc: failed to allocate 192.00 MiB of pinned memory: unknown error\n", + "llama_kv_cache_init: CPU KV buffer size = 192.00 MiB\n", + "llama_new_context_with_model: KV self size = 192.00 MiB, K (f16): 96.00 MiB, V (f16): 96.00 MiB\n", + "ggml_cuda_host_malloc: failed to allocate 0.61 MiB of pinned memory: unknown error\n", + "llama_new_context_with_model: CPU output buffer size = 0.61 MiB\n", + "ggml_cuda_host_malloc: failed to allocate 83.01 MiB of pinned memory: unknown error\n", + "llama_new_context_with_model: CUDA_Host compute buffer size = 83.01 MiB\n", + "llama_new_context_with_model: graph nodes = 1286\n", + "llama_new_context_with_model: graph splits = 1\n", + "[Stage 5:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] initializing slots n_slots=4\n", + "[INFO] new slot id_slot=0 n_ctx_slot=128\n", + "[INFO] new slot id_slot=1 n_ctx_slot=128\n", + "[INFO] new slot id_slot=2 n_ctx_slot=128\n", + "[INFO] new slot id_slot=3 n_ctx_slot=128\n", + "[INFO] model loaded\n", + "[INFO] chat template chat_example=\"<|system|>\\nYou are a helpful assistant<|end|>\\n<|user|>\\nHello<|end|>\\n<|assistant|>\\nHi there<|end|>\\n<|user|>\\nHow are you?<|end|>\\n<|assistant|>\\n\" built_in=true\n", + "[INFO] all slots are idle\n", + "[INFO] slot is processing task id_slot=0 id_task=0\n", + "[INFO] kv cache rm [p0, end) id_slot=0 id_task=0 p0=0\n", + "[INFO] prompt eval time = 318.87 ms / 5 tokens ( 63.77 ms per token, 15.68 tokens per second) id_slot=0 id_task=0 t_prompt_processing=318.873 n_prompt_tokens_processed=5 t_token=63.7746 n_tokens_second=15.680223788153905\n", + "[INFO] generation eval time = 4136.03 ms / 20 runs ( 206.80 ms per token, 4.84 tokens per second) id_slot=0 id_task=0 t_token_generation=4136.032 n_decoded=20 t_token=206.8016 n_tokens_second=4.835552529574239\n", + 
"[INFO] total time = 4454.90 ms id_slot=0 id_task=0 t_prompt_processing=318.873 t_token_generation=4136.032 t_total=4454.905\n", + "[INFO] slot released id_slot=0 id_task=0 n_ctx=512 n_past=24 n_system_tokens=0 n_cache_tokens=0 truncated=false\n", + "[INFO] all slots are idle\n", + "+------------------------------------------------------------------------------------------------------------+\n", + "|completions |\n", + "+------------------------------------------------------------------------------------------------------------+\n", + "|[{document, 0, 78, 384,000 kilometers away from the Earth. Use scientific notation to express this, {}, []}]|\n", + "+------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", + "\n", + "auto_gguf_model = AutoGGUFModel.load(\"Phi-3-mini-4k-instruct-q4_spark_nlp\")\n", + "\n", + "pipeline = Pipeline().setStages([document_assembler, auto_gguf_model])\n", + "\n", + "data = spark.createDataFrame([[\"The moon is \"]]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(data).transform(data)\n", + "result.select(\"completions\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of GGUF models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/project/Dependencies.scala b/project/Dependencies.scala index e44934a3e06d47..202fb048044dea 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -128,5 +128,10 @@ object Dependencies { val azureIdentity = "com.azure" % "azure-identity" % azureIdentityVersion % Provided val azureStorage = "com.azure" % "azure-storage-blob" % azureStorageVersion % Provided + val llamaCppVersion = "0.1.1-rc2" + val llamaCppCPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-cpu" % llamaCppVersion + val llamaCppGPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-gpu" % llamaCppVersion + val llamaCppSilicon = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-silicon" % llamaCppVersion + /** ------- Dependencies end ------- */ } diff --git a/python/setup.py b/python/setup.py index 1a41299ee3ab5c..cebe55084f4427 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='5.4.2', # Required + version='5.5.0', # Required # This is a one-line description or tagline of what your project does. 
This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/annotator/seq2seq/__init__.py b/python/sparknlp/annotator/seq2seq/__init__.py index 69ee444e14d00a..e9c3984c21ecc1 100644 --- a/python/sparknlp/annotator/seq2seq/__init__.py +++ b/python/sparknlp/annotator/seq2seq/__init__.py @@ -21,9 +21,10 @@ from sparknlp.annotator.seq2seq.m2m100_transformer import * from sparknlp.annotator.seq2seq.phi2_transformer import * from sparknlp.annotator.seq2seq.mistral_transformer import * +from sparknlp.annotator.seq2seq.auto_gguf_model import * from sparknlp.annotator.seq2seq.phi3_transformer import * from sparknlp.annotator.seq2seq.nllb_transformer import * from sparknlp.annotator.seq2seq.cpm_transformer import * from sparknlp.annotator.seq2seq.qwen_transformer import * from sparknlp.annotator.seq2seq.starcoder_transformer import * -from sparknlp.annotator.seq2seq.llama3_transformer import * \ No newline at end of file +from sparknlp.annotator.seq2seq.llama3_transformer import * diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py new file mode 100755 index 00000000000000..4441d1d8c6e88b --- /dev/null +++ b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py @@ -0,0 +1,804 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for the AutoGGUFModel.""" +from typing import List, Dict + +from sparknlp.common import * + + +class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate): + """ + Annotator that uses the llama.cpp library to generate text completions with large language + models. + + For settable parameters, and their explanations, see the parameters of this class and refer to + the llama.cpp documentation of + `server.cpp `__ + for more information. + + If the parameters are not set, the annotator will default to use the parameters provided by + the model. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> auto_gguf_model = AutoGGUFModel.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("completions") + + The default model is ``"gguf-phi3-mini-4k-instruct-q4"``, if no name is provided. + + For extended examples of usage, see the + `AutoGGUFModelTest `__ + and the + `example notebook `__. + + For available pretrained models please see the `Models Hub `__. 
+ + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``DOCUMENT`` + ====================== ====================== + + Parameters + ---------- + nThreads + Set the number of threads to use during generation + nThreadsDraft + Set the number of threads to use during draft generation + nThreadsBatch + Set the number of threads to use during batch and prompt processing + nThreadsBatchDraft + Set the number of threads to use during batch and prompt processing + nCtx + Set the size of the prompt context + nBatch + Set the logical batch size for prompt processing (must be >=32 to use BLAS) + nUbatch + Set the physical batch size for prompt processing (must be >=32 to use BLAS) + nDraft + Set the number of tokens to draft for speculative decoding + nChunks + Set the maximal number of chunks to process + nSequences + Set the number of sequences to decode + pSplit + Set the speculative decoding split probability + nGpuLayers + Set the number of layers to store in VRAM (-1 - use default) + nGpuLayersDraft + Set the number of layers to store in VRAM for the draft model (-1 - use default) + gpuSplitMode + Set how to split the model across GPUs + mainGpu + Set the main GPU that is used for scratch and small tensors. + tensorSplit + Set how split tensors should be distributed across GPUs + grpAttnN + Set the group-attention factor + grpAttnW + Set the group-attention width + ropeFreqBase + Set the RoPE base frequency, used by NTK-aware scaling + ropeFreqScale + Set the RoPE frequency scaling factor, expands context by a factor of 1/N + yarnExtFactor + Set the YaRN extrapolation mix factor + yarnAttnFactor + Set the YaRN scale sqrt(t) or attention magnitude + yarnBetaFast + Set the YaRN low correction dim or beta + yarnBetaSlow + Set the YaRN high correction dim or alpha + yarnOrigCtx + Set the YaRN original context size of model + defragmentationThreshold + Set the KV cache defragmentation threshold + numaStrategy + Set optimization strategies that help on some NUMA systems (if available) + ropeScalingType + Set the RoPE frequency scaling method, defaults to linear unless specified by the model + poolingType + Set the pooling type for embeddings, use model default if unspecified + modelDraft + Set the draft model for speculative decoding + modelAlias + Set a model alias + lookupCacheStaticFilePath + Set path to static lookup cache to use for lookup decoding (not updated by generation) + lookupCacheDynamicFilePath + Set path to dynamic lookup cache to use for lookup decoding (updated by generation) + embedding + Whether to load model with embedding support + flashAttention + Whether to enable Flash Attention + inputPrefixBos + Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string + useMmap + Whether to use memory-map model (faster load but may increase pageouts if not using mlock) + useMlock + Whether to force the system to keep model in RAM rather than swapping or compressing + noKvOffload + Whether to disable KV offload + systemPrompt + Set a system prompt to use + chatTemplate + The chat template to use + inputPrefix + Set the prompt to start generation with + inputSuffix + Set a suffix for infilling + cachePrompt + Whether to remember the prompt to avoid reprocessing it + nPredict + Set the number of tokens to predict + topK + Set top-k sampling + topP + Set top-p sampling + minP + Set min-p sampling + tfsZ + Set tail free sampling, parameter z + typicalP + Set locally 
typical sampling, parameter p + temperature + Set the temperature + dynatempRange + Set the dynamic temperature range + dynatempExponent + Set the dynamic temperature exponent + repeatLastN + Set the last n tokens to consider for penalties + repeatPenalty + Set the penalty of repeated sequences of tokens + frequencyPenalty + Set the repetition alpha frequency penalty + presencePenalty + Set the repetition alpha presence penalty + miroStat + Set MiroStat sampling strategies. + mirostatTau + Set the MiroStat target entropy, parameter tau + mirostatEta + Set the MiroStat learning rate, parameter eta + penalizeNl + Whether to penalize newline tokens + nKeep + Set the number of tokens to keep from the initial prompt + seed + Set the RNG seed + nProbs + Set the amount top tokens probabilities to output if greater than 0. + minKeep + Set the amount of tokens the samplers should return at least (0 = disabled) + grammar + Set BNF-like grammar to constrain generations + penaltyPrompt + Override which part of the prompt is penalized for repetition. + ignoreEos + Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf) + disableTokenIds + Set the token ids to disable in the completion + stopStrings + Set strings upon seeing which token generation is stopped + samplers + Set which samplers to use for token generation in the given order + useChatTemplate + Set whether or not generate should apply a chat template + + + Notes + ----- + To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + the number of GPU layers with the `setNGpuLayers` method. + + When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + according to your hardware to avoid out-of-memory errors. + + References + ---------- + - `Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension + `__ + - https://github.com/pytorch/fairseq + + **Paper Abstract:** + *We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. + BART is trained by (1) corrupting text with an arbitrary noising function, and (2) + learning a model to reconstruct the original text. It uses a standard Tranformer-based + neural machine translation architecture which, despite its simplicity, can be seen as + generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), + and many other more recent pretraining schemes. We evaluate a number of noising approaches, + finding the best performance by both randomly shuffling the order of the original sentences + and using a novel in-filling scheme, where spans of text are replaced with a single mask token. + BART is particularly effective when fine tuned for text generation but also works well for + comprehension tasks. It matches the performance of RoBERTa with comparable training resources + on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, + question answering, and summarization tasks, with gains of up to 6 ROUGE. BART also provides + a 1.1 BLEU increase over a back-translation system for machine translation, with only target + language pretraining. 
We also report ablation experiments that replicate other pretraining + schemes within the BART framework, to better measure which factors most influence end-task performance.* + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> document = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> autoGGUFModel = AutoGGUFModel.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("completions") \\ + ... .setBatchSize(4) \\ + ... .setNPredict(20) \\ + ... .setNGpuLayers(99) \\ + ... .setTemperature(0.4) \\ + ... .setTopK(40) \\ + ... .setTopP(0.9) \\ + ... .setPenalizeNl(True) + >>> pipeline = Pipeline().setStages([document, autoGGUFModel]) + >>> data = spark.createDataFrame([["Hello, I am a"]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.select("completions").show(truncate = False) + +-----------------------------------------------------------------------------------------------------------------------------------+ + |completions | + +-----------------------------------------------------------------------------------------------------------------------------------+ + |[{document, 0, 78, new user. I am currently working on a project and I need to create a list of , {prompt -> Hello, I am a}, []}]| + +-----------------------------------------------------------------------------------------------------------------------------------+ + """ + + name = "AutoGGUFModel" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + outputAnnotatorType = AnnotatorType.DOCUMENT + + # -------- MODEl PARAMETERS -------- + nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation", + typeConverter=TypeConverters.toInt) + nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation", + typeConverter=TypeConverters.toInt) + nThreadsBatch = Param(Params._dummy(), "nThreadsBatch", + "Set the number of threads to use during batch and prompt processing", + typeConverter=TypeConverters.toInt) + nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft", + "Set the number of threads to use during batch and prompt processing", + typeConverter=TypeConverters.toInt) + nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt) + nBatch = Param(Params._dummy(), "nBatch", + "Set the logical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt) + nUbatch = Param(Params._dummy(), "nUbatch", + "Set the physical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt) + nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding", + typeConverter=TypeConverters.toInt) + nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process", + typeConverter=TypeConverters.toInt) + nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode", + typeConverter=TypeConverters.toInt) + pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability", + typeConverter=TypeConverters.toFloat) + nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)", + typeConverter=TypeConverters.toInt) + nGpuLayersDraft = Param(Params._dummy(), 
"nGpuLayersDraft", + "Set the number of layers to store in VRAM for the draft model (-1 - use default)", + typeConverter=TypeConverters.toInt) + # Set how to split the model across GPUs + # + # - NONE: No GPU split + # - LAYER: Split the model across GPUs by layer + # - ROW: Split the model across GPUs by rows + gpuSplitMode = Param(Params._dummy(), "gpuSplitMode", "Set how to split the model across GPUs", + typeConverter=TypeConverters.toString) + mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.", + typeConverter=TypeConverters.toInt) + tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs", + typeConverter=TypeConverters.toListFloat) + grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt) + grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt) + ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling", + typeConverter=TypeConverters.toFloat) + ropeFreqScale = Param(Params._dummy(), "ropeFreqScale", + "Set the RoPE frequency scaling factor, expands context by a factor of 1/N", + typeConverter=TypeConverters.toFloat) + yarnExtFactor = Param(Params._dummy(), "yarnExtFactor", "Set the YaRN extrapolation mix factor", + typeConverter=TypeConverters.toFloat) + yarnAttnFactor = Param(Params._dummy(), "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude", + typeConverter=TypeConverters.toFloat) + yarnBetaFast = Param(Params._dummy(), "yarnBetaFast", "Set the YaRN low correction dim or beta", + typeConverter=TypeConverters.toFloat) + yarnBetaSlow = Param(Params._dummy(), "yarnBetaSlow", "Set the YaRN high correction dim or alpha", + typeConverter=TypeConverters.toFloat) + yarnOrigCtx = Param(Params._dummy(), "yarnOrigCtx", "Set the YaRN original context size of model", + typeConverter=TypeConverters.toInt) + defragmentationThreshold = Param(Params._dummy(), "defragmentationThreshold", + "Set the KV cache defragmentation threshold", typeConverter=TypeConverters.toFloat) + # Set optimization strategies that help on some NUMA systems (if available) + # + # Available Strategies: + # + # - DISABLED: No NUMA optimizations + # - DISTRIBUTE: Spread execution evenly over all + # - ISOLATE: Only spawn threads on CPUs on the node that execution started on + # - NUMA_CTL: Use the CPU map provided by numactl + # - MIRROR: Mirrors the model across NUMA nodes + numaStrategy = Param(Params._dummy(), "numaStrategy", + "Set optimization strategies that help on some NUMA systems (if available)", + typeConverter=TypeConverters.toString) + # Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
+ # + # - UNSPECIFIED: Don't use any scaling + # - LINEAR: Linear scaling + # - YARN: YaRN RoPE scaling + ropeScalingType = Param(Params._dummy(), "ropeScalingType", + "Set the RoPE frequency scaling method, defaults to linear unless specified by the model", + typeConverter=TypeConverters.toString) + # Set the pooling type for embeddings, use model default if unspecified + # + # - 0 UNSPECIFIED: Don't use any pooling + # - 1 MEAN: Mean Pooling + # - 2 CLS: CLS Pooling + poolingType = Param(Params._dummy(), "poolingType", + "Set the pooling type for embeddings, use model default if unspecified", + typeConverter=TypeConverters.toString) + modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding", + typeConverter=TypeConverters.toString) + modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString) + lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath", + "Set path to static lookup cache to use for lookup decoding (not updated by generation)", + typeConverter=TypeConverters.toString) + lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath", + "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)", + typeConverter=TypeConverters.toString) + # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") + embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support", + typeConverter=TypeConverters.toBoolean) + flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention", + typeConverter=TypeConverters.toBoolean) + inputPrefixBos = Param(Params._dummy(), "inputPrefixBos", + "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string", + typeConverter=TypeConverters.toBoolean) + useMmap = Param(Params._dummy(), "useMmap", + "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)", + typeConverter=TypeConverters.toBoolean) + useMlock = Param(Params._dummy(), "useMlock", + "Whether to force the system to keep model in RAM rather than swapping or compressing", + typeConverter=TypeConverters.toBoolean) + noKvOffload = Param(Params._dummy(), "noKvOffload", "Whether to disable KV offload", + typeConverter=TypeConverters.toBoolean) + systemPrompt = Param(Params._dummy(), "systemPrompt", "Set a system prompt to use", + typeConverter=TypeConverters.toString) + chatTemplate = Param(Params._dummy(), "chatTemplate", "The chat template to use", + typeConverter=TypeConverters.toString) + + # -------- INFERENCE PARAMETERS -------- + inputPrefix = Param(Params._dummy(), "inputPrefix", "Set the prompt to start generation with", + typeConverter=TypeConverters.toString) + inputSuffix = Param(Params._dummy(), "inputSuffix", "Set a suffix for infilling", + typeConverter=TypeConverters.toString) + cachePrompt = Param(Params._dummy(), "cachePrompt", "Whether to remember the prompt to avoid reprocessing it", + typeConverter=TypeConverters.toBoolean) + nPredict = Param(Params._dummy(), "nPredict", "Set the number of tokens to predict", + typeConverter=TypeConverters.toInt) + topK = Param(Params._dummy(), "topK", "Set top-k sampling", typeConverter=TypeConverters.toInt) + topP = Param(Params._dummy(), "topP", "Set top-p sampling", typeConverter=TypeConverters.toFloat) + minP = Param(Params._dummy(), "minP", "Set min-p sampling", typeConverter=TypeConverters.toFloat) + tfsZ = Param(Params._dummy(), "tfsZ", "Set tail free 
sampling, parameter z", typeConverter=TypeConverters.toFloat) + typicalP = Param(Params._dummy(), "typicalP", "Set locally typical sampling, parameter p", + typeConverter=TypeConverters.toFloat) + temperature = Param(Params._dummy(), "temperature", "Set the temperature", typeConverter=TypeConverters.toFloat) + dynamicTemperatureRange = Param(Params._dummy(), "dynatempRange", "Set the dynamic temperature range", + typeConverter=TypeConverters.toFloat) + dynamicTemperatureExponent = Param(Params._dummy(), "dynatempExponent", "Set the dynamic temperature exponent", + typeConverter=TypeConverters.toFloat) + repeatLastN = Param(Params._dummy(), "repeatLastN", "Set the last n tokens to consider for penalties", + typeConverter=TypeConverters.toInt) + repeatPenalty = Param(Params._dummy(), "repeatPenalty", "Set the penalty of repeated sequences of tokens", + typeConverter=TypeConverters.toFloat) + frequencyPenalty = Param(Params._dummy(), "frequencyPenalty", "Set the repetition alpha frequency penalty", + typeConverter=TypeConverters.toFloat) + presencePenalty = Param(Params._dummy(), "presencePenalty", "Set the repetition alpha presence penalty", + typeConverter=TypeConverters.toFloat) + miroStat = Param(Params._dummy(), "miroStat", "Set MiroStat sampling strategies.", + typeConverter=TypeConverters.toString) + miroStatTau = Param(Params._dummy(), "mirostatTau", "Set the MiroStat target entropy, parameter tau", + typeConverter=TypeConverters.toFloat) + miroStatEta = Param(Params._dummy(), "mirostatEta", "Set the MiroStat learning rate, parameter eta", + typeConverter=TypeConverters.toFloat) + penalizeNl = Param(Params._dummy(), "penalizeNl", "Whether to penalize newline tokens", + typeConverter=TypeConverters.toBoolean) + nKeep = Param(Params._dummy(), "nKeep", "Set the number of tokens to keep from the initial prompt", + typeConverter=TypeConverters.toInt) + seed = Param(Params._dummy(), "seed", "Set the RNG seed", typeConverter=TypeConverters.toInt) + nProbs = Param(Params._dummy(), "nProbs", "Set the amount top tokens probabilities to output if greater than 0.", + typeConverter=TypeConverters.toInt) + minKeep = Param(Params._dummy(), "minKeep", + "Set the amount of tokens the samplers should return at least (0 = disabled)", + typeConverter=TypeConverters.toInt) + grammar = Param(Params._dummy(), "grammar", "Set BNF-like grammar to constrain generations", + typeConverter=TypeConverters.toString) + penaltyPrompt = Param(Params._dummy(), "penaltyPrompt", + "Override which part of the prompt is penalized for repetition.", + typeConverter=TypeConverters.toString) + ignoreEos = Param(Params._dummy(), "ignoreEos", + "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)", + typeConverter=TypeConverters.toBoolean) + disableTokenIds = Param(Params._dummy(), "disableTokenIds", "Set the token ids to disable in the completion", + typeConverter=TypeConverters.toListInt) + stopStrings = Param(Params._dummy(), "stopStrings", "Set strings upon seeing which token generation is stopped", + typeConverter=TypeConverters.toListString) + samplers = Param(Params._dummy(), "samplers", "Set which samplers to use for token generation in the given order", + typeConverter=TypeConverters.toListString) + useChatTemplate = Param(Params._dummy(), "useChatTemplate", + "Set whether or not generate should apply a chat template", + typeConverter=TypeConverters.toBoolean) + + # -------- MODEL SETTERS -------- + def setNThreads(self, nThreads: int): + """Set the number of threads to use 
during generation""" + return self._set(nThreads=nThreads) + + def setNThreadsDraft(self, nThreadsDraft: int): + """Set the number of threads to use during draft generation""" + return self._set(nThreadsDraft=nThreadsDraft) + + def setNThreadsBatch(self, nThreadsBatch: int): + """Set the number of threads to use during batch and prompt processing""" + return self._set(nThreadsBatch=nThreadsBatch) + + def setNThreadsBatchDraft(self, nThreadsBatchDraft: int): + """Set the number of threads to use during batch and prompt processing""" + return self._set(nThreadsBatchDraft=nThreadsBatchDraft) + + def setNCtx(self, nCtx: int): + """Set the size of the prompt context""" + return self._set(nCtx=nCtx) + + def setNBatch(self, nBatch: int): + """Set the logical batch size for prompt processing (must be >=32 to use BLAS)""" + return self._set(nBatch=nBatch) + + def setNUbatch(self, nUbatch: int): + """Set the physical batch size for prompt processing (must be >=32 to use BLAS)""" + return self._set(nUbatch=nUbatch) + + def setNDraft(self, nDraft: int): + """Set the number of tokens to draft for speculative decoding""" + return self._set(nDraft=nDraft) + + def setNChunks(self, nChunks: int): + """Set the maximal number of chunks to process""" + return self._set(nChunks=nChunks) + + def setNSequences(self, nSequences: int): + """Set the number of sequences to decode""" + return self._set(nSequences=nSequences) + + def setPSplit(self, pSplit: float): + """Set the speculative decoding split probability""" + return self._set(pSplit=pSplit) + + def setNGpuLayers(self, nGpuLayers: int): + """Set the number of layers to store in VRAM (-1 - use default)""" + return self._set(nGpuLayers=nGpuLayers) + + def setNGpuLayersDraft(self, nGpuLayersDraft: int): + """Set the number of layers to store in VRAM for the draft model (-1 - use default)""" + return self._set(nGpuLayersDraft=nGpuLayersDraft) + + def setGpuSplitMode(self, gpuSplitMode: str): + """Set how to split the model across GPUs""" + return self._set(gpuSplitMode=gpuSplitMode) + + def setMainGpu(self, mainGpu: int): + """Set the main GPU that is used for scratch and small tensors.""" + return self._set(mainGpu=mainGpu) + + def setTensorSplit(self, tensorSplit: List[float]): + """Set how split tensors should be distributed across GPUs""" + return self._set(tensorSplit=tensorSplit) + + def setGrpAttnN(self, grpAttnN: int): + """Set the group-attention factor""" + return self._set(grpAttnN=grpAttnN) + + def setGrpAttnW(self, grpAttnW: int): + """Set the group-attention width""" + return self._set(grpAttnW=grpAttnW) + + def setRopeFreqBase(self, ropeFreqBase: float): + """Set the RoPE base frequency, used by NTK-aware scaling""" + return self._set(ropeFreqBase=ropeFreqBase) + + def setRopeFreqScale(self, ropeFreqScale: float): + """Set the RoPE frequency scaling factor, expands context by a factor of 1/N""" + return self._set(ropeFreqScale=ropeFreqScale) + + def setYarnExtFactor(self, yarnExtFactor: float): + """Set the YaRN extrapolation mix factor""" + return self._set(yarnExtFactor=yarnExtFactor) + + def setYarnAttnFactor(self, yarnAttnFactor: float): + """Set the YaRN scale sqrt(t) or attention magnitude""" + return self._set(yarnAttnFactor=yarnAttnFactor) + + def setYarnBetaFast(self, yarnBetaFast: float): + """Set the YaRN low correction dim or beta""" + return self._set(yarnBetaFast=yarnBetaFast) + + def setYarnBetaSlow(self, yarnBetaSlow: float): + """Set the YaRN high correction dim or alpha""" + return self._set(yarnBetaSlow=yarnBetaSlow) + + def 
setYarnOrigCtx(self, yarnOrigCtx: int):
+        """Set the YaRN original context size of model"""
+        return self._set(yarnOrigCtx=yarnOrigCtx)
+
+    def setDefragmentationThreshold(self, defragmentationThreshold: float):
+        """Set the KV cache defragmentation threshold"""
+        return self._set(defragmentationThreshold=defragmentationThreshold)
+
+    def setNumaStrategy(self, numaStrategy: str):
+        """Set optimization strategies that help on some NUMA systems (if available)"""
+        return self._set(numaStrategy=numaStrategy)
+
+    def setRopeScalingType(self, ropeScalingType: str):
+        """Set the RoPE frequency scaling method, defaults to linear unless specified by the model"""
+        return self._set(ropeScalingType=ropeScalingType)
+
+    def setPoolingType(self, poolingType: str):
+        """Set the pooling type for embeddings, use model default if unspecified"""
+        return self._set(poolingType=poolingType)
+
+    def setModelDraft(self, modelDraft: str):
+        """Set the draft model for speculative decoding"""
+        return self._set(modelDraft=modelDraft)
+
+    def setModelAlias(self, modelAlias: str):
+        """Set a model alias"""
+        return self._set(modelAlias=modelAlias)
+
+    def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
+        """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
+        return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)
+
+    def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
+        """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
+        return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)
+
+    def setEmbedding(self, embedding: bool):
+        """Whether to load model with embedding support"""
+        return self._set(embedding=embedding)
+
+    def setFlashAttention(self, flashAttention: bool):
+        """Whether to enable Flash Attention"""
+        return self._set(flashAttention=flashAttention)
+
+    def setInputPrefixBos(self, inputPrefixBos: bool):
+        """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
+        return self._set(inputPrefixBos=inputPrefixBos)
+
+    def setUseMmap(self, useMmap: bool):
+        """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)"""
+        return self._set(useMmap=useMmap)
+
+    def setUseMlock(self, useMlock: bool):
+        """Whether to force the system to keep model in RAM rather than swapping or compressing"""
+        return self._set(useMlock=useMlock)
+
+    def setNoKvOffload(self, noKvOffload: bool):
+        """Whether to disable KV offload"""
+        return self._set(noKvOffload=noKvOffload)
+
+    def setSystemPrompt(self, systemPrompt: str):
+        """Set a system prompt to use"""
+        return self._set(systemPrompt=systemPrompt)
+
+    def setChatTemplate(self, chatTemplate: str):
+        """The chat template to use"""
+        return self._set(chatTemplate=chatTemplate)
+
+    # -------- INFERENCE SETTERS --------
+    def setInputPrefix(self, inputPrefix: str):
+        """Set the prompt to start generation with"""
+        return self._set(inputPrefix=inputPrefix)
+
+    def setInputSuffix(self, inputSuffix: str):
+        """Set a suffix for infilling"""
+        return self._set(inputSuffix=inputSuffix)
+
+    def setCachePrompt(self, cachePrompt: bool):
+        """Whether to remember the prompt to avoid reprocessing it"""
+        return self._set(cachePrompt=cachePrompt)
+
+    def setNPredict(self, nPredict: int):
+        """Set the number of tokens to predict"""
+        return self._set(nPredict=nPredict)
+
+    def setTopK(self, topK: int):
+        """Set top-k sampling"""
+        return self._set(topK=topK)
+
+    def setTopP(self, topP: float):
+        """Set top-p sampling"""
+        return self._set(topP=topP)
+
+    def setMinP(self, minP: float):
+        """Set min-p sampling"""
+        return self._set(minP=minP)
+
+    def setTfsZ(self, tfsZ: float):
+        """Set tail free sampling, parameter z"""
+        return self._set(tfsZ=tfsZ)
+
+    def setTypicalP(self, typicalP: float):
+        """Set locally typical sampling, parameter p"""
+        return self._set(typicalP=typicalP)
+
+    def setTemperature(self, temperature: float):
+        """Set the temperature"""
+        return self._set(temperature=temperature)
+
+    def setDynamicTemperatureRange(self, dynamicTemperatureRange: float):
+        """Set the dynamic temperature range"""
+        return self._set(dynamicTemperatureRange=dynamicTemperatureRange)
+
+    def setDynamicTemperatureExponent(self, dynamicTemperatureExponent: float):
+        """Set the dynamic temperature exponent"""
+        return self._set(dynamicTemperatureExponent=dynamicTemperatureExponent)
+
+    def setRepeatLastN(self, repeatLastN: int):
+        """Set the last n tokens to consider for penalties"""
+        return self._set(repeatLastN=repeatLastN)
+
+    def setRepeatPenalty(self, repeatPenalty: float):
+        """Set the penalty of repeated sequences of tokens"""
+        return self._set(repeatPenalty=repeatPenalty)
+
+    def setFrequencyPenalty(self, frequencyPenalty: float):
+        """Set the repetition alpha frequency penalty"""
+        return self._set(frequencyPenalty=frequencyPenalty)
+
+    def setPresencePenalty(self, presencePenalty: float):
+        """Set the repetition alpha presence penalty"""
+        return self._set(presencePenalty=presencePenalty)
+
+    def setMiroStat(self, miroStat: str):
+        """Set MiroStat sampling strategies."""
+        return self._set(miroStat=miroStat)
+
+    def setMiroStatTau(self, miroStatTau: float):
+        """Set the MiroStat target entropy, parameter tau"""
+        return self._set(miroStatTau=miroStatTau)
+
+    def setMiroStatEta(self, miroStatEta: float):
+        """Set the MiroStat learning rate, parameter eta"""
+        return self._set(miroStatEta=miroStatEta)
+
+    def setPenalizeNl(self, penalizeNl: bool):
+        """Whether to penalize newline tokens"""
+        return self._set(penalizeNl=penalizeNl)
+
+    def setNKeep(self, nKeep: int):
+        """Set the number of tokens to keep from the initial prompt"""
+        return self._set(nKeep=nKeep)
+
+    def setSeed(self, seed: int):
+        """Set the RNG seed"""
+        return self._set(seed=seed)
+
+    def setNProbs(self, nProbs: int):
+        """Set the amount of top token probabilities to output if greater than 0."""
+        return self._set(nProbs=nProbs)
+
+    def setMinKeep(self, minKeep: int):
+        """Set the amount of tokens the samplers should return at least (0 = disabled)"""
+        return self._set(minKeep=minKeep)
+
+    def setGrammar(self, grammar: str):
+        """Set BNF-like grammar to constrain generations"""
+        return self._set(grammar=grammar)
+
+    def setPenaltyPrompt(self, penaltyPrompt: str):
+        """Override which part of the prompt is penalized for repetition."""
+        return self._set(penaltyPrompt=penaltyPrompt)
+
+    def setIgnoreEos(self, ignoreEos: bool):
+        """Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)"""
+        return self._set(ignoreEos=ignoreEos)
+
+    def setDisableTokenIds(self, disableTokenIds: List[int]):
+        """Set the token ids to disable in the completion"""
+        return self._set(disableTokenIds=disableTokenIds)
+
+    def setStopStrings(self, stopStrings: List[str]):
+        """Set strings upon seeing which token generation is stopped"""
+        return self._set(stopStrings=stopStrings)
+
+    def setSamplers(self, samplers: List[str]):
+        """Set which samplers to use for token generation in the given
order""" + return self._set(samplers=samplers) + + def setUseChatTemplate(self, useChatTemplate: bool): + """Set whether generate should apply a chat template""" + return self._set(useChatTemplate=useChatTemplate) + + # -------- JAVA SETTERS -------- + def setTokenIdBias(self, tokenIdBias: Dict[int, float]): + """Set token id bias""" + return self._call_java("setTokenIdBias", tokenIdBias) + + def setTokenBias(self, tokenBias: Dict[str, float]): + """Set token id bias""" + return self._call_java("setTokenBias", tokenBias) + + def setLoraAdapters(self, loraAdapters: Dict[str, float]): + """Set token id bias""" + return self._call_java("setLoraAdapters", loraAdapters) + + def getMetadata(self): + """Gets the metadata of the model""" + return self._call_java("getMetadata") + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel", java_model=None): + super(AutoGGUFModel, self).__init__( + classname=classname, + java_model=java_model + ) + # self._setDefault() + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + AutoGGUFModel + The restored model + """ + from sparknlp.internal import _AutoGGUFLoader + jModel = _AutoGGUFLoader(folder, spark_session._jsparkSession)._java_obj + return AutoGGUFModel(java_model=jModel) + + @staticmethod + def pretrained(name="gguf-phi3-mini-4k-instruct-q4", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default "gguf-phi3-mini-4k-instruct-q4" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. + + Returns + ------- + AutoGGUFModel + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(AutoGGUFModel, name, lang, remote_loc) diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index 2c3ece653f7f3d..adf19667279fbe 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -974,6 +974,12 @@ def __init__(self, path, jspark): ) +class _AutoGGUFLoader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_AutoGGUFLoader, self).__init__( + "com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel.loadSavedModel", path, jspark) + + class _MxbaiEmbeddingsLoader(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_MxbaiEmbeddingsLoader, self).__init__( @@ -986,4 +992,3 @@ def __init__(self, path, jspark): super(_SnowFlakeEmbeddingsLoader, self).__init__( "com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings.loadSavedModel", path, jspark ) - diff --git a/python/test/annotator/seq2seq/auto_gguf_model_test.py b/python/test/annotator/seq2seq/auto_gguf_model_test.py new file mode 100644 index 00000000000000..0c2d4a349ca806 --- /dev/null +++ b/python/test/annotator/seq2seq/auto_gguf_model_test.py @@ -0,0 +1,194 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.slow +class AutoGGUFModelTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + + def runTest(self): + data = ( + self.spark.createDataFrame( + [ + ["The moons of Jupiter are "], + ["Earth is "], + ["The moon is "], + ["The sun is "], + ] + ) + .toDF("text") + .repartition(1) + ) + + document_assembler = ( + DocumentAssembler().setInputCol("text").setOutputCol("document") + ) + + modelPath = "models/codellama-7b.Q2_K.gguf" + model = ( + AutoGGUFModel.loadSavedModel(modelPath, self.spark) + .setInputCols("document") + .setOutputCol("completions") + .setBatchSize(4) + .setNPredict(20) + .setNGpuLayers(99) + .setTemperature(0.4) + .setTopK(40) + .setTopP(0.9) + .setPenalizeNl(True) + ) + + pipeline = Pipeline().setStages([document_assembler, model]) + results = pipeline.fit(data).transform(data) + + results.select("completions").show(truncate=False) + + +@pytest.mark.slow +class AutoGGUFModelParametersTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + + def runTest(self): + data = ( + self.spark.createDataFrame([["The moons of Jupiter are "]]) + .toDF("text") + .repartition(1) + ) + + document_assembler = ( + DocumentAssembler().setInputCol("text").setOutputCol("document") + ) + + modelPath = "models/codellama-7b.Q2_K.gguf" + model = ( + AutoGGUFModel.loadSavedModel(modelPath, self.spark) + .setInputCols("document") + .setOutputCol("completions") + .setBatchSize(4) + ) + + # Model Parameters + model.setNThreads(8) + model.setNThreadsDraft(8) + model.setNThreadsBatch(8) + model.setNThreadsBatchDraft(8) + model.setNCtx(512) + model.setNBatch(32) + model.setNUbatch(32) + model.setNDraft(5) + model.setNChunks(-1) + model.setNSequences(1) + model.setPSplit(0.1) + model.setNGpuLayers(99) + model.setNGpuLayersDraft(99) + model.setGpuSplitMode("NONE") + model.setMainGpu(0) + model.setTensorSplit([]) + model.setNBeams(0) + model.setGrpAttnN(1) + model.setGrpAttnW(512) + model.setRopeFreqBase(1.0) + model.setRopeFreqScale(1.0) + model.setYarnExtFactor(1.0) + model.setYarnAttnFactor(1.0) + model.setYarnBetaFast(32.0) + model.setYarnBetaSlow(1.0) + model.setYarnOrigCtx(0) + model.setDefragmentationThreshold(-1.0) + model.setNumaStrategy("DISTRIBUTE") + model.setRopeScalingType("UNSPECIFIED") + model.setPoolingType("UNSPECIFIED") + model.setModelDraft("") + model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") + model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") + model.setLoraBase("") + model.setEmbedding(False) + model.setFlashAttention(False) + model.setInputPrefixBos(False) + model.setUseMmap(False) + model.setUseMlock(False) + model.setNoKvOffload(False) + model.setSystemPrompt("") + model.setChatTemplate("") + + # Inference Parameters + model.setInputPrefix("") + model.setInputSuffix("") + model.setCachePrompt(False) + model.setNPredict(-1) + model.setTopK(40) + model.setTopP(0.9) + model.setMinP(0.1) + model.setTfsZ(1.0) + 
model.setTypicalP(1.0) + model.setTemperature(0.8) + model.setDynamicTemperatureRange(0.0) + model.setDynamicTemperatureExponent(1.0) + model.setRepeatLastN(64) + model.setRepeatPenalty(1.0) + model.setFrequencyPenalty(0.0) + model.setPresencePenalty(0.0) + model.setMiroStat("DISABLED") + model.setMiroStatTau(5.0) + model.setMiroStatEta(0.1) + model.setPenalizeNl(False) + model.setNKeep(0) + model.setSeed(-1) + model.setNProbs(0) + model.setMinKeep(0) + model.setGrammar("") + model.setPenaltyPrompt("") + model.setIgnoreEos(False) + model.setDisableTokenIds([]) + model.setStopStrings([]) + model.setUseChatTemplate(False) + model.setNPredict(2) + model.setSamplers(["TOP_P", "TOP_K"]) + + # Special PySpark Parameters (Scala StructFeatures) + model.setTokenIdBias({0: 0.0, 1: 0.0}) + model.setTokenBias({"!": 0.0, "?": 0.0}) + model.setLoraAdapters({" ": 0.0}) + + pipeline = Pipeline().setStages([document_assembler, model]) + results = pipeline.fit(data).transform(data) + + results.select("completions").show(truncate=False) + + +@pytest.mark.slow +class AutoGGUFModelMetadataTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + + def runTest(self): + modelPath = "models/codellama-7b.Q2_K.gguf" + model = ( + AutoGGUFModel.loadSavedModel(modelPath, self.spark) + .setInputCols("document") + .setOutputCol("completions") + ) + + metadata = model.getMetadata() + assert len(metadata) > 0 + print(eval(metadata)) diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties index 703f281a1da1d1..6a17297f6fda41 100644 --- a/src/main/resources/log4j.properties +++ b/src/main/resources/log4j.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=WARN, STDOUT +log4j.rootLogger=DEBUG, STDOUT log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n diff --git a/src/main/resources/log4j2.properties b/src/main/resources/log4j2.properties index 703f281a1da1d1..5824b6f6cf7dab 100644 --- a/src/main/resources/log4j2.properties +++ b/src/main/resources/log4j2.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=WARN, STDOUT +log4j.rootLogger=debug, STDOUT log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala new file mode 100644 index 00000000000000..495e8cb2a6b0f9 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala @@ -0,0 +1,92 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.ml.gguf + +import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters} +import org.apache.spark.SparkFiles +import org.apache.spark.sql.SparkSession +import org.slf4j.{Logger, LoggerFactory} + +import java.io.File +import java.nio.file.{Files, Paths} + +class GGUFWrapper(var modelFileName: String, var modelFolder: String) extends Serializable { + + /** For Deserialization */ + def this() = { + this(null, null) + } + + // Important for serialization on none-kryo serializers + @transient private var llamaModel: LlamaModel = _ + + def getSession(modelParameters: ModelParameters): LlamaModel = + this.synchronized { + if (llamaModel == null) { + // TODO: Validate when modelFileName or tmpFolder is None?? + val modelFilePath = SparkFiles.get(modelFileName) + + if (Paths.get(modelFilePath).toFile.exists()) { + modelParameters.setModelFilePath(modelFilePath) + llamaModel = GGUFWrapper.withSafeGGUFModelLoader(modelParameters) + } else + throw new IllegalStateException( + s"Model file $modelFileName does not exist in SparkFiles.") + } + // TODO: if the model is already loaded then the model parameters will not apply. perhaps output a logline here. + llamaModel + } + + def saveToFile(file: String): Unit = { + val modelFilePath = SparkFiles.get(modelFileName) + val modelOutputPath = Paths.get(file, modelFileName) + Files.copy(Paths.get(modelFilePath), modelOutputPath) + } + + // Destructor to free the model when this object is garbage collected + override def finalize(): Unit = { + if (llamaModel != null) { + llamaModel.close() + } + } + +} + +/** Companion object */ +object GGUFWrapper { + private[GGUFWrapper] val logger: Logger = LoggerFactory.getLogger("GGUFWrapper") + + // TODO: make sure this.synchronized is needed or it's not a bottleneck + private def withSafeGGUFModelLoader(modelParameters: ModelParameters): LlamaModel = + this.synchronized { + new LlamaModel(modelParameters) // TODO: Model parameters + } + + def read(sparkSession: SparkSession, modelPath: String): GGUFWrapper = { + // TODO Better Sanity Check + val modelFile = new File(modelPath) + val modelFileExist: Boolean = modelFile.exists() + + if (!modelFile.getName.endsWith(".gguf")) + throw new IllegalArgumentException(s"Model file $modelPath is not a GGUF model file") + + if (modelFileExist) { + sparkSession.sparkContext.addFile(modelPath) + } else throw new IllegalArgumentException(s"Model file $modelPath does not exist") + + new GGUFWrapper(modelFile.getName, modelFile.getParent) + } +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala new file mode 100644 index 00000000000000..e6d832eef9a79f --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala @@ -0,0 +1,1292 @@ +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel +import com.johnsnowlabs.nlp.llama.args._ +import com.johnsnowlabs.nlp.llama.{InferenceParameters, ModelParameters} +import com.johnsnowlabs.nlp.serialization.StructFeature +import org.apache.spark.ml.param._ +import org.slf4j.LoggerFactory + +import scala.collection.mutable +import scala.jdk.CollectionConverters._ + +/** Contains settable parameters for the [[AutoGGUFModel]]. 
+ * + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupprio setParam 1 + * @groupprio getParam 2 + * @groupprio param 3 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +trait HasLlamaCppProperties { + this: ParamsAndFeaturesWritable with HasProtectedParams => + val logger = LoggerFactory.getLogger(this.getClass) + // ---------------- MODEL PARAMETERS ---------------- + /** @group param */ + val nThreads = + new IntParam(this, "nThreads", "Set the number of threads to use during generation") + + /** @group param */ + val nThreadsDraft = new IntParam( + this, + "nThreadsDraft", + "Set the number of threads to use during draft generation") + + /** @group param */ + val nThreadsBatch = new IntParam( + this, + "nThreadsBatch", + "Set the number of threads to use during batch and prompt processing") + + /** @group param */ + val nThreadsBatchDraft = new IntParam( + this, + "nThreadsBatchDraft", + "Set the number of threads to use during batch and prompt processing") + + /** @group param */ + val nCtx = new IntParam(this, "nCtx", "Set the size of the prompt context") + + /** @group param */ + val nBatch = new IntParam( + this, + "nBatch", + "Set the logical batch size for prompt processing (must be >=32 to use BLAS)") + + /** @group param */ + val nUbatch = new IntParam( + this, + "nUbatch", + "Set the physical batch size for prompt processing (must be >=32 to use BLAS)") + + /** @group param */ + val nDraft = + new IntParam(this, "nDraft", "Set the number of tokens to draft for speculative decoding") + + /** @group param */ + val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") + + /** @group param */ + val nSequences = + new IntParam(this, "nSequences", "Set the number of sequences to decode") + + /** @group param */ + val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") + + /** @group param */ + val nGpuLayers = new IntParam( + this, + "nGpuLayers", + "Set the number of layers to store in VRAM (-1 - use default)") + + /** @group param */ + val nGpuLayersDraft = new IntParam( + this, + "nGpuLayersDraft", + "Set the number of layers to store in VRAM for the draft model (-1 - use default)") + + /** Set how to split the model across GPUs + * + * - NONE: No GPU split + * - LAYER: Split the model across GPUs by layer + * - ROW: Split the model across GPUs by rows + * + * @group param + */ + val gpuSplitMode = + new Param[String](this, "gpuSplitMode", "Set how to split the model across GPUs") + + /** @group param */ + val mainGpu = + new IntParam(this, "mainGpu", "Set the main GPU that is used for scratch and small tensors.") + + /** @group param */ + val tensorSplit = new DoubleArrayParam( + this, + "tensorSplit", + "Set how split tensors should be distributed across GPUs") + + /** @group param */ + val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") + + /** @group param */ + val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") + + /** @group param */ + val ropeFreqBase = + new FloatParam(this, "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling") + + /** @group param */ + val ropeFreqScale = new FloatParam( + this, + "ropeFreqScale", + "Set the RoPE frequency scaling factor, expands context by a factor of 1/N") + + /** @group param */ + val 
yarnExtFactor = + new FloatParam(this, "yarnExtFactor", "Set the YaRN extrapolation mix factor") + + /** @group param */ + val yarnAttnFactor = + new FloatParam(this, "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude") + + /** @group param */ + val yarnBetaFast = + new FloatParam(this, "yarnBetaFast", "Set the YaRN low correction dim or beta") + + /** @group param */ + val yarnBetaSlow = + new FloatParam(this, "yarnBetaSlow", "Set the YaRN high correction dim or alpha") + + /** @group param */ + val yarnOrigCtx = + new IntParam(this, "yarnOrigCtx", "Set the YaRN original context size of model") + + /** @group param */ + val defragmentationThreshold = + new FloatParam(this, "defragmentationThreshold", "Set the KV cache defragmentation threshold") + + /** Set optimization strategies that help on some NUMA systems (if available) + * + * Available Strategies: + * + * - DISABLED: No NUMA optimizations + * - DISTRIBUTE: Spread execution evenly over all + * - ISOLATE: Only spawn threads on CPUs on the node that execution started on + * - NUMA_CTL: Use the CPU map provided by numactl + * - MIRROR: Mirrors the model across NUMA nodes + * + * @group param + */ + val numaStrategy = new Param[String]( + this, + "numaStrategy", + "Set optimization strategies that help on some NUMA systems (if available)") + + /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. + * + * - UNSPECIFIED: Don't use any scaling + * - LINEAR: Linear scaling + * - YARN: YaRN RoPE scaling + * @group param + */ + val ropeScalingType = new Param[String]( + this, + "ropeScalingType", + "Set the RoPE frequency scaling method, defaults to linear unless specified by the model") + + /** Set the pooling type for embeddings, use model default if unspecified + * + * - 0 UNSPECIFIED: Don't use any pooling + * - 1 MEAN: Mean Pooling + * - 2 CLS: CLS Pooling + * + * @group param + */ + val poolingType = new Param[String]( + this, + "poolingType", + "Set the pooling type for embeddings, use model default if unspecified") + // model = new Param[String](this, "model", "Set the model file path to load") + /** @group param */ + val modelDraft = + new Param[String](this, "modelDraft", "Set the draft model for speculative decoding") + + // modelAlias = new Param[String](this, "modelAlias", "Set a model alias") + /** @group param */ + val lookupCacheStaticFilePath = new Param[String]( + this, + "lookupCacheStaticFilePath", + "Set path to static lookup cache to use for lookup decoding (not updated by generation)") + + /** @group param */ + val lookupCacheDynamicFilePath = new Param[String]( + this, + "lookupCacheDynamicFilePath", + "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") + + /** @group param */ + val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") + + val embedding = + new BooleanParam(this, "embedding", "Whether to load model with embedding support") + + /** @group param */ + val flashAttention = + new BooleanParam(this, "flashAttention", "Whether to enable Flash Attention") + + /** @group param */ + val inputPrefixBos = new BooleanParam( + this, + "inputPrefixBos", + "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string") + + /** @group param */ + val useMmap = new BooleanParam( + this, + "useMmap", + "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)") + + /** @group param */ + val useMlock = new BooleanParam( + this, + "useMlock", + "Whether to force 
the system to keep model in RAM rather than swapping or compressing") + + /** @group param */ + val noKvOffload = new BooleanParam(this, "noKvOffload", "Whether to disable KV offload") + + /** @group param */ + val systemPrompt = new Param[String](this, "systemPrompt", "Set a system prompt to use") + + /** @group param */ + val chatTemplate = + new Param[String](this, "chatTemplate", "The chat template to use") + + /** Set the number of threads to use during generation + * + * @group setParam + */ + def setNThreads(nThreads: Int): this.type = { set(this.nThreads, nThreads) } + + /** Set the number of threads to use during draft generation + * + * @group setParam + */ + def setNThreadsDraft(nThreadsDraft: Int): this.type = { set(this.nThreadsDraft, nThreadsDraft) } + + /** Set the number of threads to use during batch and prompt processing + * + * @group setParam + */ + def setNThreadsBatch(nThreadsBatch: Int): this.type = { set(this.nThreadsBatch, nThreadsBatch) } + + /** Set the number of threads to use during batch and prompt processing + * + * @group setParam + */ + def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { + set(this.nThreadsBatchDraft, nThreadsBatchDraft) + } + + /** Set the size of the prompt context + * + * @group setParam + */ + def setNCtx(nCtx: Int): this.type = { set(this.nCtx, nCtx) } + + /** Set the logical batch size for prompt processing (must be >=32 to use BLAS) + * + * @group setParam + */ + def setNBatch(nBatch: Int): this.type = { set(this.nBatch, nBatch) } + + /** Set the physical batch size for prompt processing (must be >=32 to use BLAS) + * + * @group setParam + */ + def setNUbatch(nUbatch: Int): this.type = { set(this.nUbatch, nUbatch) } + + /** Set the number of tokens to draft for speculative decoding + * + * @group setParam + */ + def setNDraft(nDraft: Int): this.type = { set(this.nDraft, nDraft) } + + /** Set the maximal number of chunks to process + * + * @group setParam + */ + def setNChunks(nChunks: Int): this.type = { set(this.nChunks, nChunks) } + + /** Set the number of sequences to decode + * + * @group setParam + */ + def setNSequences(nSequences: Int): this.type = { set(this.nSequences, nSequences) } + + /** Set the speculative decoding split probability + * + * @group setParam + */ + def setPSplit(pSplit: Float): this.type = { set(this.pSplit, pSplit) } + + /** Set the number of layers to store in VRAM (-1 - use default) + * + * @group setParam + */ + def setNGpuLayers(nGpuLayers: Int): this.type = { set(this.nGpuLayers, nGpuLayers) } + + /** Set the number of layers to store in VRAM for the draft model (-1 - use default) + * + * @group setParam + */ + def setNGpuLayersDraft(nGpuLayersDraft: Int): this.type = { + set(this.nGpuLayersDraft, nGpuLayersDraft) + } + + /** Set how to split the model across GPUs + * + * - NONE: No GPU split + * -LAYER: Split the model across GPUs by layer 2. 
ROW: Split the model across GPUs by rows + * + * @group setParam + */ + def setGpuSplitMode(splitMode: String): this.type = { set(this.gpuSplitMode, splitMode) } + + /** Set the GPU that is used for scratch and small tensors + * + * @group setParam + */ + def setMainGpu(mainGpu: Int): this.type = { set(this.mainGpu, mainGpu) } + + /** Set how split tensors should be distributed across GPUs + * + * @group setParam + */ + def setTensorSplit(tensorSplit: Array[Double]): this.type = { + set(this.tensorSplit, tensorSplit) + } + + /** Set the group-attention factor + * + * @group setParam + */ + def setGrpAttnN(grpAttnN: Int): this.type = { set(this.grpAttnN, grpAttnN) } + + /** Set the group-attention width + * + * @group setParam + */ + def setGrpAttnW(grpAttnW: Int): this.type = { set(this.grpAttnW, grpAttnW) } + + /** Set the RoPE base frequency, used by NTK-aware scaling + * + * @group setParam + */ + def setRopeFreqBase(ropeFreqBase: Float): this.type = { set(this.ropeFreqBase, ropeFreqBase) } + + /** Set the RoPE frequency scaling factor, expands context by a factor of 1/N + * + * @group setParam + */ + def setRopeFreqScale(ropeFreqScale: Float): this.type = { + set(this.ropeFreqScale, ropeFreqScale) + } + + /** Set the YaRN extrapolation mix factor + * + * @group setParam + */ + def setYarnExtFactor(yarnExtFactor: Float): this.type = { + set(this.yarnExtFactor, yarnExtFactor) + } + + /** Set the YaRN scale sqrt(t) or attention magnitude + * + * @group setParam + */ + def setYarnAttnFactor(yarnAttnFactor: Float): this.type = { + set(this.yarnAttnFactor, yarnAttnFactor) + } + + /** Set the YaRN low correction dim or beta + * + * @group setParam + */ + def setYarnBetaFast(yarnBetaFast: Float): this.type = { set(this.yarnBetaFast, yarnBetaFast) } + + /** Set the YaRN high correction dim or alpha + * + * @group setParam + */ + def setYarnBetaSlow(yarnBetaSlow: Float): this.type = { set(this.yarnBetaSlow, yarnBetaSlow) } + + /** Set the YaRN original context size of model + * + * @group setParam + */ + def setYarnOrigCtx(yarnOrigCtx: Int): this.type = { set(this.yarnOrigCtx, yarnOrigCtx) } + + /** Set the KV cache defragmentation threshold + * + * @group setParam + */ + def setDefragmentationThreshold(defragThold: Float): this.type = { + set(this.defragmentationThreshold, defragThold) + } + + /** Set optimization strategies that help on some NUMA systems (if available) + * + * Available Strategies: + * + * - DISABLED: No NUMA optimizations + * - DISTRIBUTE: spread execution evenly over all + * - ISOLATE: only spawn threads on CPUs on the node that execution started on + * - NUMA_CTL: use the CPU map provided by numactl + * - MIRROR: Mirrors the model across NUMA nodes + * + * @group setParam + */ + def setNumaStrategy(numa: String): this.type = { set(this.numaStrategy, numa) } + + /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
+ * + * - UNSPECIFIED: Don't use any scaling + * - LINEAR: Linear scaling + * - YARN: YaRN RoPE scaling + * @group setParam + */ + def setRopeScalingType(ropeScalingType: String): this.type = { + set(this.ropeScalingType, ropeScalingType) + } + + /** Set the pooling type for embeddings, use model default if unspecified + * + * - UNSPECIFIED: Don't use any pooling + * - MEAN: Mean Pooling + * - CLS: CLS Pooling + * + * @group setParam + */ + def setPoolingType(poolingType: String): this.type = { set(this.poolingType, poolingType) } + + /** Set the draft model for speculative decoding + * + * @group setParam + */ + def setModelDraft(modelDraft: String): this.type = { set(this.modelDraft, modelDraft) } + + /** Set a model alias + * + * @group setParam + */ + def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { + set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) + } + + /** Set a model alias + * + * @group setParam + */ + def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { + set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) + } + + /** Sets paths to lora adapters with user defined scale. + * + * @group setParam + */ + def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { + set(this.loraAdapters, loraAdapters) + } + + /** Sets paths to lora adapters with user defined scale. (PySpark Override) + * + * @group setParam + */ + def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { + val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } + set(this.loraAdapters, scalaLoraAdapters.toMap) + } + + /** Whether to load model with embedding support + * + * @group setParam + */ + def setEmbedding(embedding: Boolean): this.type = { set(this.embedding, embedding) } + + /** Whether to enable Flash Attention + * + * @group setParam + */ + def setFlashAttention(flashAttention: Boolean): this.type = { + set(this.flashAttention, flashAttention) + } + + /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string + * + * @group setParam + */ + def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { + set(this.inputPrefixBos, inputPrefixBos) + } + + /** Whether to use memory-map model (faster load but may increase pageouts if not using mlock) + * + * @group setParam + */ + def setUseMmap(useMmap: Boolean): this.type = { set(this.useMmap, useMmap) } + + /** Whether to force the system to keep model in RAM rather than swapping or compressing + * + * @group setParam + */ + def setUseMlock(useMlock: Boolean): this.type = { set(this.useMlock, useMlock) } + + /** Whether to disable KV offload + * + * @group setParam + */ + def setNoKvOffload(noKvOffload: Boolean): this.type = { set(this.noKvOffload, noKvOffload) } + + /** Set a system prompt to use + * + * @group setParam + */ + def setSystemPrompt(systemPrompt: String): this.type = { set(this.systemPrompt, systemPrompt) } + + /** The chat template to use + * + * @group setParam + */ + def setChatTemplate(chatTemplate: String): this.type = { set(this.chatTemplate, chatTemplate) } + + // ---------------- GETTERS ---------------- + /** @group getParam */ + def getNThreads: Int = $(nThreads) + + /** @group getParam */ + def getNThreadsDraft: Int = $(nThreadsDraft) + + /** @group getParam */ + def getNThreadsBatch: Int = $(nThreadsBatch) + + /** @group getParam */ + def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) + + /** @group getParam */ + def getNCtx: Int = $(nCtx) 
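+
+  // Sizing note (an illustrative sketch, not a prescription): nCtx bounds the prompt plus
+  // generated tokens held in the KV cache, while nBatch/nUbatch only control how many prompt
+  // tokens are processed per step. For example, a typical single-GPU configuration might be:
+  //   model.setNCtx(4096).setNBatch(512).setNUbatch(512).setNGpuLayers(99)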
+ + /** @group getParam */ + def getNBatch: Int = $(nBatch) + + /** @group getParam */ + def getNUbatch: Int = $(nUbatch) + + /** @group getParam */ + def getNDraft: Int = $(nDraft) + + /** @group getParam */ + def getNChunks: Int = $(nChunks) + + /** @group getParam */ + def getNSequences: Int = $(nSequences) + + /** @group getParam */ + def getPSplit: Float = $(pSplit) + + /** @group getParam */ + def getNGpuLayers: Int = $(nGpuLayers) + + /** @group getParam */ + def getNGpuLayersDraft: Int = $(nGpuLayersDraft) + + /** @group getParam */ + def getSplitMode: String = $(gpuSplitMode) + + /** @group getParam */ + def getMainGpu: Int = $(mainGpu) + + /** @group getParam */ + def getTensorSplit: Array[Double] = $(tensorSplit) + + def getGrpAttnN: Int = $(grpAttnN) + + /** @group getParam */ + def getGrpAttnW: Int = $(grpAttnW) + + /** @group getParam */ + def getRopeFreqBase: Float = $(ropeFreqBase) + + /** @group getParam */ + def getRopeFreqScale: Float = $(ropeFreqScale) + + /** @group getParam */ + def getYarnExtFactor: Float = $(yarnExtFactor) + + /** @group getParam */ + def getYarnAttnFactor: Float = $(yarnAttnFactor) + + /** @group getParam */ + def getYarnBetaFast: Float = $(yarnBetaFast) + + /** @group getParam */ + def getYarnBetaSlow: Float = $(yarnBetaSlow) + + /** @group getParam */ + def getYarnOrigCtx: Int = $(yarnOrigCtx) + + /** @group getParam */ + def getDefragmentationThreshold: Float = $(defragmentationThreshold) + + /** @group getParam */ + def getNuma: String = $(numaStrategy) + + /** @group getParam */ + def getRopeScalingType: String = $(ropeScalingType) + + /** @group getParam */ + def getPoolingType: String = $(poolingType) + + /** @group getParam */ + def getModelDraft: String = $(modelDraft) + + /** @group getParam */ + def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) + + /** @group getParam */ + def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) + + /** @group getParam */ + def getLoraAdapters: Map[String, Float] = $$(loraAdapters) + + /** @group getParam */ + def getEmbedding: Boolean = $(embedding) + + /** @group getParam */ + def getFlashAttention: Boolean = $(flashAttention) + + /** @group getParam */ + def getInputPrefixBos: Boolean = $(inputPrefixBos) + + /** @group getParam */ + def getUseMmap: Boolean = $(useMmap) + + /** @group getParam */ + def getUseMlock: Boolean = $(useMlock) + + /** @group getParam */ + def getNoKvOffload: Boolean = $(noKvOffload) + + /** @group getParam */ + def getSystemPrompt: String = $(systemPrompt) + + /** @group getParam */ + def getChatTemplate: String = $(chatTemplate) + + // ---------------- INFERENCE PARAMETERS ---------------- + /** @group param */ + val inputPrefix = + new Param[String](this, "inputPrefix", "Set the prompt to start generation with") + + /** @group param */ + val inputSuffix = + new Param[String](this, "inputSuffix", "Set a suffix for infilling") + + /** @group param */ + val cachePrompt = new BooleanParam( + this, + "cachePrompt", + "Whether to remember the prompt to avoid reprocessing it") + + /** @group param */ + val nPredict = new IntParam(this, "nPredict", "Set the number of tokens to predict") + + /** @group param */ + val topK = new IntParam(this, "topK", "Set top-k sampling") + + /** @group param */ + val topP = new FloatParam(this, "topP", "Set top-p sampling") + + /** @group param */ + val minP = new FloatParam(this, "minP", "Set min-p sampling") + + /** @group param */ + val tfsZ = new FloatParam(this, "tfsZ", "Set tail free sampling, parameter 
z") + + /** @group param */ + val typicalP = new FloatParam(this, "typicalP", "Set locally typical sampling, parameter p") + + /** @group param */ + val temperature = new FloatParam(this, "temperature", "Set the temperature") + + /** @group param */ + val dynamicTemperatureRange = + new FloatParam(this, "dynatempRange", "Set the dynamic temperature range") + + /** @group param */ + val dynamicTemperatureExponent = + new FloatParam(this, "dynatempExponent", "Set the dynamic temperature exponent") + + /** @group param */ + val repeatLastN = + new IntParam(this, "repeatLastN", "Set the last n tokens to consider for penalties") + + /** @group param */ + val repeatPenalty = + new FloatParam(this, "repeatPenalty", "Set the penalty of repeated sequences of tokens") + + /** @group param */ + val frequencyPenalty = + new FloatParam(this, "frequencyPenalty", "Set the repetition alpha frequency penalty") + + /** @group param */ + val presencePenalty = + new FloatParam(this, "presencePenalty", "Set the repetition alpha presence penalty") + + /** @group param */ + val miroStat = new Param[String](this, "miroStat", "Set MiroStat sampling strategies.") + + /** @group param */ + val miroStatTau = + new FloatParam(this, "mirostatTau", "Set the MiroStat target entropy, parameter tau") + + /** @group param */ + val miroStatEta = + new FloatParam(this, "mirostatEta", "Set the MiroStat learning rate, parameter eta") + + /** @group param */ + val penalizeNl = new BooleanParam(this, "penalizeNl", "Whether to penalize newline tokens") + + /** @group param */ + val nKeep = + new IntParam(this, "nKeep", "Set the number of tokens to keep from the initial prompt") + + /** @group param */ + val seed = new IntParam(this, "seed", "Set the RNG seed") + + /** @group param */ + val nProbs = new IntParam( + this, + "nProbs", + "Set the amount top tokens probabilities to output if greater than 0.") + + /** @group param */ + val minKeep = new IntParam( + this, + "minKeep", + "Set the amount of tokens the samplers should return at least (0 = disabled)") + + /** @group param */ + val grammar = + new Param[String](this, "grammar", "Set BNF-like grammar to constrain generations") + + /** @group param */ + val penaltyPrompt = new Param[String]( + this, + "penaltyPrompt", + "Override which part of the prompt is penalized for repetition.") + + /** @group param */ + val ignoreEos = new BooleanParam( + this, + "ignoreEos", + "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)") + + // Modify the likelihood of tokens appearing in the completion by their id. + val tokenIdBias: StructFeature[Map[Int, Float]] = + new StructFeature[Map[Int, Float]](this, "tokenIdBias") + + // Modify the likelihood of tokens appearing in the completion by their string. 
+ /** @group param */ + val tokenBias: StructFeature[Map[String, Float]] = + new StructFeature[Map[String, Float]](this, "tokenBias") + + /** @group param */ + val disableTokenIds = + new IntArrayParam(this, "disableTokenIds", "Set the token ids to disable in the completion") + + /** @group param */ + val stopStrings = new StringArrayParam( + this, + "stopStrings", + "Set strings upon seeing which token generation is stopped") + + /** @group param */ + val samplers = new StringArrayParam( + this, + "samplers", + "Set which samplers to use for token generation in the given order") + + /** @group param */ + val useChatTemplate = new BooleanParam( + this, + "useChatTemplate", + "Set whether or not generate should apply a chat template") + + /** Set the prompt to start generation with + * + * @group setParam + */ + def setInputPrefix(inputPrefix: String): this.type = { set(this.inputPrefix, inputPrefix) } + + /** Set a suffix for infilling + * + * @group setParam + */ + def setInputSuffix(inputSuffix: String): this.type = { set(this.inputSuffix, inputSuffix) } + + /** Whether to remember the prompt to avoid reprocessing it + * + * @group setParam + */ + def setCachePrompt(cachePrompt: Boolean): this.type = { set(this.cachePrompt, cachePrompt) } + + /** Set the number of tokens to predict + * + * @group setParam + */ + def setNPredict(nPredict: Int): this.type = { set(this.nPredict, nPredict) } + + /** Set top-k sampling + * + * @group setParam + */ + def setTopK(topK: Int): this.type = { set(this.topK, topK) } + + /** Set top-p sampling + * + * @group setParam + */ + def setTopP(topP: Float): this.type = { set(this.topP, topP) } + + /** Set min-p sampling + * + * @group setParam + */ + def setMinP(minP: Float): this.type = { set(this.minP, minP) } + + /** Set tail free sampling, parameter z + * @group setParam + */ + def setTfsZ(tfsZ: Float): this.type = { set(this.tfsZ, tfsZ) } + + /** Set locally typical sampling, parameter p + * + * @group setParam + */ + def setTypicalP(typicalP: Float): this.type = { set(this.typicalP, typicalP) } + + /** Set the temperature + * + * @group setParam + */ + def setTemperature(temperature: Float): this.type = { set(this.temperature, temperature) } + + /** Set the dynamic temperature range + * + * @group setParam + */ + def setDynamicTemperatureRange(dynatempRange: Float): this.type = { + set(this.dynamicTemperatureRange, dynatempRange) + } + + /** Set the dynamic temperature exponent + * + * @group setParam + */ + def setDynamicTemperatureExponent(dynatempExponent: Float): this.type = { + set(this.dynamicTemperatureExponent, dynatempExponent) + } + + /** Set the last n tokens to consider for penalties + * + * @group setParam + */ + def setRepeatLastN(repeatLastN: Int): this.type = { set(this.repeatLastN, repeatLastN) } + + /** Set the penalty of repeated sequences of tokens + * + * @group setParam + */ + def setRepeatPenalty(repeatPenalty: Float): this.type = { + set(this.repeatPenalty, repeatPenalty) + } + + /** Set the repetition alpha frequency penalty + * + * @group setParam + */ + def setFrequencyPenalty(frequencyPenalty: Float): this.type = { + set(this.frequencyPenalty, frequencyPenalty) + } + + /** Set the repetition alpha presence penalty + * + * @group setParam + */ + def setPresencePenalty(presencePenalty: Float): this.type = { + set(this.presencePenalty, presencePenalty) + } + + /** Set MiroStat sampling strategies. 
+ + * + * - DISABLED: No MiroStat + * - V1: MiroStat V1 + * - V2: MiroStat V2 + * + * @group setParam + */ + def setMiroStat(mirostat: String): this.type = set(this.miroStat, mirostat) + + /** Set the MiroStat target entropy, parameter tau + * + * @group setParam + */ + def setMiroStatTau(mirostatTau: Float): this.type = { set(this.miroStatTau, mirostatTau) } + + /** Set the MiroStat learning rate, parameter eta + * + * @group setParam + */ + def setMiroStatEta(mirostatEta: Float): this.type = { set(this.miroStatEta, mirostatEta) } + + /** Set whether to penalize newline tokens + * + * @group setParam + */ + def setPenalizeNl(penalizeNl: Boolean): this.type = { set(this.penalizeNl, penalizeNl) } + + /** Set the number of tokens to keep from the initial prompt + * + * @group setParam + */ + def setNKeep(nKeep: Int): this.type = { set(this.nKeep, nKeep) } + + /** Set the RNG seed + * + * @group setParam + */ + def setSeed(seed: Int): this.type = { set(this.seed, seed) } + + /** Set the number of top token probabilities to output, if greater than 0. + * + * @group setParam + */ + def setNProbs(nProbs: Int): this.type = { set(this.nProbs, nProbs) } + + /** Set the minimum number of tokens the samplers should return (0 = disabled) + * + * @group setParam + */ + def setMinKeep(minKeep: Int): this.type = { set(this.minKeep, minKeep) } + + /** Set BNF-like grammar to constrain generations + * + * @group setParam + */ + def setGrammar(grammar: String): this.type = { set(this.grammar, grammar) } + + /** Override which part of the prompt is penalized for repetition. + * + * @group setParam + */ + def setPenaltyPrompt(penaltyPrompt: String): this.type = { + set(this.penaltyPrompt, penaltyPrompt) + } + + /** Set whether to ignore end of stream token and continue generating (implies --logit-bias + * 2-inf) + * + * @group setParam + */ + def setIgnoreEos(ignoreEos: Boolean): this.type = { set(this.ignoreEos, ignoreEos) } + + /** Adjust the likelihood of tokens appearing in the completion by their string. + * + * @group setParam + */ + def setTokenBias(tokenBias: Map[String, Float]): this.type = { + set(this.tokenBias, tokenBias) + } + + /** Adjust the likelihood of tokens appearing in the completion by their string. (Override for PySpark) + * + * @group setParam + */ + def setTokenBias(tokenBias: java.util.HashMap[String, java.lang.Double]): this.type = { + val scalaTokenBias = tokenBias.asScala.map { case (k, v) => k -> v.floatValue() } + set(this.tokenBias, scalaTokenBias.toMap) + } + + /** Adjust the likelihood of tokens appearing in the completion by their id. + * + * @group setParam + */ + def setTokenIdBias(tokenIdBias: Map[Int, Float]): this.type = { + set(this.tokenIdBias, tokenIdBias) + } + + /** Adjust the likelihood of tokens appearing in the completion by their id. (Override for PySpark) + * + * @group setParam + */ + def setTokenIdBias(tokenIdBias: java.util.HashMap[Integer, java.lang.Double]): this.type = { + val scalaTokenIdBias = tokenIdBias.asScala.map { case (k, v) => k.toInt -> v.toFloat } + set(this.tokenIdBias, scalaTokenIdBias.toMap) + } + + /** Set the token ids to disable in the completion. This corresponds to `setTokenIdBias` with a + * value of `Float.NEGATIVE_INFINITY`. + * + * @group setParam + */ + def setDisableTokenIds(disableTokenIds: Array[Int]): this.type = { + set(this.disableTokenIds, disableTokenIds) + } + + /** Set strings upon seeing which token generation is stopped + * + * @group setParam + */ + def setStopStrings(stopStrings: Array[String]): this.type = { + set(this.stopStrings, stopStrings) + } + + /** Set which samplers to use for token generation in the given order.
+ * + * Available Samplers are: + * + * - TOP_K: Top-k sampling + * - TFS_Z: Tail free sampling + * - TYPICAL_P: Locally typical sampling p + * - TOP_P: Top-p sampling + * - MIN_P: Min-p sampling + * - TEMPERATURE: Temperature sampling + * @group setParam + */ + def setSamplers(samplers: Array[String]): this.type = { set(this.samplers, samplers) } + + /** Set whether or not generate should apply a chat template + * + * @group setParam + */ + def setUseChatTemplate(useChatTemplate: Boolean): this.type = { + set(this.useChatTemplate, useChatTemplate) + } + + // ---------------- GETTERS ---------------- + /** @group getParam */ + def getInputPrefix: String = $(inputPrefix) + + /** @group getParam */ + def getInputSuffix: String = $(inputSuffix) + + /** @group getParam */ + def getCachePrompt: Boolean = $(cachePrompt) + + def getNPredict: Int = $(nPredict) + + /** @group getParam */ + def getTopK: Int = $(topK) + + /** @group getParam */ + def getTopP: Float = $(topP) + + /** @group getParam */ + def getMinP: Float = $(minP) + + /** @group getParam */ + def getTfsZ: Float = $(tfsZ) + + /** @group getParam */ + def getTypicalP: Float = $(typicalP) + + /** @group getParam */ + def getTemperature: Float = $(temperature) + + /** @group getParam */ + def getDynamicTemperatureRange: Float = $(dynamicTemperatureRange) + + /** @group getParam */ + def getDynamicTemperatureExponent: Float = $(dynamicTemperatureExponent) + + /** @group getParam */ + def getRepeatLastN: Int = $(repeatLastN) + + /** @group getParam */ + def getRepeatPenalty: Float = $(repeatPenalty) + + /** @group getParam */ + def getFrequencyPenalty: Float = $(frequencyPenalty) + + /** @group getParam */ + def getPresencePenalty: Float = $(presencePenalty) + + /** @group getParam */ + def getMiroStat: String = $(miroStat) + + /** @group getParam */ + def getMiroStatTau: Float = $(miroStatTau) + + /** @group getParam */ + def getMiroStatEta: Float = $(miroStatEta) + + /** @group getParam */ + def getPenalizeNl: Boolean = $(penalizeNl) + + /** @group getParam */ + def getNKeep: Int = $(nKeep) + + /** @group getParam */ + def getSeed: Int = $(seed) + + /** @group getParam */ + def getNProbs: Int = $(nProbs) + + /** @group getParam */ + def getMinKeep: Int = $(minKeep) + + /** @group getParam */ + def getGrammar: String = $(grammar) + + /** @group getParam */ + def getPenaltyPrompt: String = $(penaltyPrompt) + + /** @group getParam */ + def getIgnoreEos: Boolean = $(ignoreEos) + + /** @group getParam */ + def getTokenIdBias: Map[Int, Float] = $$(tokenIdBias) + + /** @group getParam */ + def getTokenBias: Map[String, Float] = $$(tokenBias) + + /** @group getParam */ + def getDisableTokenIds: Array[Int] = $(disableTokenIds) + + /** @group getParam */ + def getStopStrings: Array[String] = $(stopStrings) + + /** @group getParam */ + def getSamplers: Array[String] = $(samplers) + + /** @group getParam */ + def getUseChatTemplate: Boolean = $(useChatTemplate) + + protected def getModelParameters: ModelParameters = { + val modelParameters = new ModelParameters().setContinuousBatching(true) // Always enabled + + if (isDefined(chatTemplate)) modelParameters.setChatTemplate($(chatTemplate)) + if (isDefined(defragmentationThreshold)) + modelParameters.setDefragmentationThreshold($(defragmentationThreshold)) + if (isDefined(embedding)) modelParameters.setEmbedding($(embedding)) + if (isDefined(flashAttention)) modelParameters.setFlashAttention($(flashAttention)) + if (isDefined(gpuSplitMode)) + 
modelParameters.setSplitMode(GpuSplitMode.valueOf($(gpuSplitMode))) + if (isDefined(grpAttnN)) modelParameters.setGrpAttnN($(grpAttnN)) + if (isDefined(grpAttnW)) modelParameters.setGrpAttnW($(grpAttnW)) + if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos($(inputPrefixBos)) + if (isDefined(lookupCacheDynamicFilePath)) + modelParameters.setLookupCacheDynamicFilePath($(lookupCacheDynamicFilePath)) + if (isDefined(lookupCacheStaticFilePath)) + modelParameters.setLookupCacheStaticFilePath($(lookupCacheStaticFilePath)) + if (isDefined(mainGpu)) modelParameters.setMainGpu($(mainGpu)) + if (isDefined(modelDraft)) modelParameters.setModelDraft($(modelDraft)) + if (isDefined(nBatch)) modelParameters.setNBatch($(nBatch)) + if (isDefined(nChunks)) modelParameters.setNChunks($(nChunks)) + if (isDefined(nCtx)) modelParameters.setNCtx($(nCtx)) + if (isDefined(nDraft)) modelParameters.setNDraft($(nDraft)) + if (isDefined(nGpuLayers)) modelParameters.setNGpuLayers($(nGpuLayers)) + if (isDefined(nGpuLayersDraft)) modelParameters.setNGpuLayersDraft($(nGpuLayersDraft)) + if (isDefined(nSequences)) modelParameters.setNSequences($(nSequences)) + if (isDefined(nThreads)) modelParameters.setNThreads($(nThreads)) + if (isDefined(nThreadsBatch)) modelParameters.setNThreadsBatch($(nThreadsBatch)) + if (isDefined(nThreadsBatchDraft)) + modelParameters.setNThreadsBatchDraft($(nThreadsBatchDraft)) + if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft($(nThreadsDraft)) + if (isDefined(nUbatch)) modelParameters.setNUbatch($(nUbatch)) + if (isDefined(noKvOffload)) modelParameters.setNoKvOffload($(noKvOffload)) + if (isDefined(numaStrategy)) modelParameters.setNuma(NumaStrategy.valueOf($(numaStrategy))) + if (isDefined(pSplit)) modelParameters.setPSplit($(pSplit)) + if (isDefined(poolingType)) + modelParameters.setPoolingType(PoolingType.valueOf($(poolingType))) + if (isDefined(ropeFreqBase)) modelParameters.setRopeFreqBase($(ropeFreqBase)) + if (isDefined(ropeFreqScale)) modelParameters.setRopeFreqScale($(ropeFreqScale)) + if (isDefined(ropeScalingType)) + modelParameters.setRopeScalingType(RopeScalingType.valueOf($(ropeScalingType))) + if (isDefined(systemPrompt)) modelParameters.setSystemPrompt($(systemPrompt)) + if (isDefined(tensorSplit)) modelParameters.setTensorSplit($(tensorSplit).map(_.toFloat)) + if (isDefined(useMlock)) modelParameters.setUseMlock($(useMlock)) + if (isDefined(useMmap)) modelParameters.setUseMmap($(useMmap)) + if (isDefined(yarnAttnFactor)) modelParameters.setYarnAttnFactor($(yarnAttnFactor)) + if (isDefined(yarnBetaFast)) modelParameters.setYarnBetaFast($(yarnBetaFast)) + if (isDefined(yarnBetaSlow)) modelParameters.setYarnBetaSlow($(yarnBetaSlow)) + if (isDefined(yarnExtFactor)) modelParameters.setYarnExtFactor($(yarnExtFactor)) + if (isDefined(yarnOrigCtx)) modelParameters.setYarnOrigCtx($(yarnOrigCtx)) + if (loraAdapters.isSet) { + val loraAdaptersMap: mutable.Map[String, java.lang.Float] = + mutable.Map($$(loraAdapters).map { case (key, value) => + (key, float2Float(value)) + }.toSeq: _*) + modelParameters.setLoraAdapters(loraAdaptersMap.asJava) + } // Need to convert to mutable map first + + modelParameters + } + + protected def getInferenceParameters: InferenceParameters = { + val inferenceParams = new InferenceParameters("") + if (isDefined(cachePrompt)) inferenceParams.setCachePrompt($(cachePrompt)) + if (isDefined(disableTokenIds)) { + val javaCollection: java.util.Collection[Integer] = + $(disableTokenIds).map(int2Integer).toSeq.asJava +
inferenceParams.disableTokenIds(javaCollection) + } + if (isDefined(dynamicTemperatureExponent)) + inferenceParams.setDynamicTemperatureExponent($(dynamicTemperatureExponent)) + if (isDefined(dynamicTemperatureRange)) + inferenceParams.setDynamicTemperatureRange($(dynamicTemperatureRange)) + if (isDefined(frequencyPenalty)) inferenceParams.setFrequencyPenalty($(frequencyPenalty)) + if (isDefined(grammar)) inferenceParams.setGrammar($(grammar)) + if (isDefined(ignoreEos)) inferenceParams.setIgnoreEos($(ignoreEos)) + if (isDefined(inputPrefix)) inferenceParams.setInputPrefix($(inputPrefix)) + if (isDefined(inputSuffix)) inferenceParams.setInputSuffix($(inputSuffix)) + if (isDefined(minKeep)) inferenceParams.setMinKeep($(minKeep)) + if (isDefined(minP)) inferenceParams.setMinP($(minP)) + if (isDefined(miroStat)) inferenceParams.setMiroStat(MiroStat.valueOf($(miroStat))) + if (isDefined(miroStatEta)) inferenceParams.setMiroStatEta($(miroStatEta)) + if (isDefined(miroStatTau)) inferenceParams.setMiroStatTau($(miroStatTau)) + if (isDefined(nKeep)) inferenceParams.setNKeep($(nKeep)) + if (isDefined(nPredict)) inferenceParams.setNPredict($(nPredict)) + if (isDefined(nProbs)) inferenceParams.setNProbs($(nProbs)) + if (isDefined(penalizeNl)) inferenceParams.setPenalizeNl($(penalizeNl)) + if (isDefined(penaltyPrompt)) inferenceParams.setPenaltyPrompt($(penaltyPrompt)) + if (isDefined(presencePenalty)) inferenceParams.setPresencePenalty($(presencePenalty)) + if (isDefined(repeatLastN)) inferenceParams.setRepeatLastN($(repeatLastN)) + if (isDefined(repeatPenalty)) inferenceParams.setRepeatPenalty($(repeatPenalty)) + if (isDefined(samplers)) inferenceParams.setSamplers($(samplers).map(Sampler.valueOf): _*) + if (isDefined(seed)) inferenceParams.setSeed($(seed)) + if (isDefined(stopStrings)) inferenceParams.setStopStrings($(stopStrings): _*) + if (isDefined(temperature)) inferenceParams.setTemperature($(temperature)) + if (isDefined(tfsZ)) inferenceParams.setTfsZ($(tfsZ)) + if (isDefined(topK)) inferenceParams.setTopK($(topK)) + if (isDefined(topP)) inferenceParams.setTopP($(topP)) + if (isDefined(typicalP)) inferenceParams.setTypicalP($(typicalP)) + if (isDefined(useChatTemplate)) inferenceParams.setUseChatTemplate($(useChatTemplate)) + if (tokenBias.isSet) { + val tokenBiasMap: mutable.Map[String, java.lang.Float] = mutable.Map($$(tokenBias).map { + case (key, value) => (key, float2Float(value)) + }.toSeq: _*) + inferenceParams.setTokenBias(tokenBiasMap.asJava) + } + if (tokenIdBias.isSet) { + val tokenIdBiasMap: mutable.Map[Integer, java.lang.Float] = + mutable.Map($$(tokenIdBias).map { case (key, value) => + (int2Integer(key), float2Float(value)) + }.toSeq: _*) + inferenceParams.setTokenIdBias(tokenIdBiasMap.asJava) + } + + inferenceParams + } + + // ---------------- METADATA ---------------- + val metadata = + new Param[String](this, "metadata", "Set the metadata for the model").setProtected() + + /** Set the metadata for the model + * @group setParam + */ + def setMetadata(metadata: String): this.type = { set(this.metadata, metadata) } + + /** Get the metadata for the model + * @group getParam + */ + def getMetadata: String = $(metadata) +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index 98387ad04fb14a..1b46ec8330bc48 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -812,11 +812,13 @@ package object annotator { object UAEEmbeddings extends 
ReadablePretrainedUAEModel with ReadUAEDLModel + type AutoGGUFModel = com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel + object AutoGGUFModel extends ReadablePretrainedAutoGGUFModel with ReadAutoGGUFModel + type MxbaiEmbeddings = com.johnsnowlabs.nlp.embeddings.MxbaiEmbeddings object MxbaiEmbeddings extends ReadablePretrainedMxbaiModel with ReadMxbaiDLModel - type SnowFlakeEmbeddings = com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala new file mode 100644 index 00000000000000..11d41bdb4d739a --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -0,0 +1,271 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.nlp.annotators.seq2seq + +import com.johnsnowlabs.ml.gguf.GGUFWrapper +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.nlp.llama.LlamaModel +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods + +/** Annotator that uses the llama.cpp library to generate text completions with large language + * models. + * + * For settable parameters, and their explanations, see [[HasLlamaCppProperties]] and refer to + * the llama.cpp documentation of + * [[https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server server.cpp]] + * for more information. + * + * If the parameters are not set, the annotator will default to use the parameters provided by + * the model. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val autoGGUFModel = AutoGGUFModel.pretrained() + * .setInputCols("document") + * .setOutputCol("completions") + * }}} + * The default model is `"gguf-phi3-mini-4k-instruct-q4"`, if no name is provided. + * + * For available pretrained models please see the [[https://sparknlp.org/models Models Hub]]. + * + * For extended examples of usage, see the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala AutoGGUFModelTest]] + * and the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb example notebook]]. + * + * ==Note== + * To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + * the number of GPU layers with the `setNGpuLayers` method. + * + * When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + * according to your hardware to avoid out-of-memory errors. 
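The note above is the main guidance for memory tuning. A minimal sketch, not part of this PR, of what such a configuration might look like; the values are illustrative and need to be adapted to the model and hardware, they are not defaults of this annotator:

```scala
// Sketch only: memory-related knobs for larger GGUF models (illustrative values).
val tunedModel = AutoGGUFModel
  .pretrained()
  .setInputCols("document")
  .setOutputCol("completions")
  .setNCtx(4096)      // context window in tokens; larger windows use more memory
  .setNGpuLayers(99)  // offload (up to) all layers to the GPU (GPU package only)
  .setNBatch(512)     // logical batch size for prompt processing
```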
+ * + * ==Example== + * + * {{{ + * import com.johnsnowlabs.nlp.base._ + * import com.johnsnowlabs.nlp.annotator._ + * import org.apache.spark.ml.Pipeline + * import spark.implicits._ + * + * val document = new DocumentAssembler() + * .setInputCol("text") + * .setOutputCol("document") + * + * val autoGGUFModel = AutoGGUFModel + * .pretrained() + * .setInputCols("document") + * .setOutputCol("completions") + * .setBatchSize(4) + * .setNPredict(20) + * .setNGpuLayers(99) + * .setTemperature(0.4f) + * .setTopK(40) + * .setTopP(0.9f) + * .setPenalizeNl(true) + * + * val pipeline = new Pipeline().setStages(Array(document, autoGGUFModel)) + * + * val data = Seq("Hello, I am a").toDF("text") + * val result = pipeline.fit(data).transform(data) + * result.select("completions").show(truncate = false) + * +-----------------------------------------------------------------------------------------------------------------------------------+ + * |completions | + * +-----------------------------------------------------------------------------------------------------------------------------------+ + * |[{document, 0, 78, new user. I am currently working on a project and I need to create a list of , {prompt -> Hello, I am a}, []}]| + * +-----------------------------------------------------------------------------------------------------------------------------------+ + * }}} + * + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class AutoGGUFModel(override val uid: String) + extends AnnotatorModel[AutoGGUFModel] + with HasBatchedAnnotate[AutoGGUFModel] + with HasEngine + with HasLlamaCppProperties + with HasProtectedParams { + + override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT + override val inputAnnotatorTypes: Array[AnnotatorType] = Array(AnnotatorType.DOCUMENT) + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("AutoGGUFModel")) + + private var _model: Option[Broadcast[GGUFWrapper]] = None + + // Values for automatic GPU support + private val defaultGpuLayers = 1000 + private val defaultMainGpu = 0 + + /** @group getParam */ + def getModelIfNotSet: GGUFWrapper = _model.get.value + + /** @group setParam */ + def setModelIfNotSet(spark: SparkSession, wrapper: GGUFWrapper): this.type = { + if (_model.isEmpty) { + _model = Some(spark.sparkContext.broadcast(wrapper)) + } + + // Entrypoint for models. Automatically set GPU support if detected. + val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) + if (usingGPUJar) { + logger.info("Using GPU jar. Offloading all layers to GPU.") + setMainGpu(defaultMainGpu) + setNGpuLayers(defaultGpuLayers) + } + this + } + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + getModelIfNotSet.saveToFile(path) + } + + /** Completes the batch of annotations. 
+ * + * @param batchedAnnotations + * Annotations (single element arrays) in batches + * @return + * Completed text sequences + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + val annotations: Seq[Annotation] = batchedAnnotations.flatten + if (annotations.nonEmpty) { + + val modelParams = + getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size + val inferenceParams = getInferenceParameters + + val model: LlamaModel = getModelIfNotSet.getSession(modelParams) + + val annotationsText = annotations.map(_.result) + + val (completedTexts: Array[String], metadata: Map[String, String]) = + try { + (model.requestBatchCompletion(annotationsText.toArray, inferenceParams), Map.empty) + } catch { + case e: Exception => + logger.error("Error in llama.cpp batch completion", e) + (Array[String](), Map("exception" -> e.getMessage)) + } + + val result: Seq[Seq[Annotation]] = + annotations.zip(completedTexts).map { case (annotation, text) => + Seq( + new Annotation( + outputAnnotatorType, + 0, + text.length - 1, + text, + annotation.metadata ++ metadata)) + } + result + } else Seq(Seq.empty[Annotation]) + } + + def getMetadataMap: Map[String, String] = { + val metadataJsonString = getMetadata + if (metadataJsonString.isEmpty) Map.empty + else { + implicit val formats: DefaultFormats.type = DefaultFormats + JsonMethods.parse(metadataJsonString).extract[Map[String, String]] + } + } +} + +trait ReadablePretrainedAutoGGUFModel + extends ParamsAndFeaturesReadable[AutoGGUFModel] + with HasPretrained[AutoGGUFModel] { + override val defaultModelName: Some[String] = Some("gguf-phi3-mini-4k-instruct-q4") + override val defaultLang: String = "en" + + /** Java compliant-overrides */ + override def pretrained(): AutoGGUFModel = super.pretrained() + + override def pretrained(name: String): AutoGGUFModel = super.pretrained(name) + + override def pretrained(name: String, lang: String): AutoGGUFModel = + super.pretrained(name, lang) + + override def pretrained(name: String, lang: String, remoteLoc: String): AutoGGUFModel = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadAutoGGUFModel { + this: ParamsAndFeaturesReadable[AutoGGUFModel] => + + def readModel(instance: AutoGGUFModel, path: String, spark: SparkSession): Unit = { + def findGGUFModelInFolder(): String = { + val folder = new java.io.File(path) + if (folder.exists && folder.isDirectory) { + folder.listFiles + .filter(_.isFile) + .filter(_.getName.endsWith(".gguf")) + .map(_.getAbsolutePath) + .headOption // Should only be one file + .getOrElse(throw new IllegalArgumentException(s"Could not find GGUF model in $path")) + } else { + throw new IllegalArgumentException(s"Path $path is not a directory") + } + } + + val model = AutoGGUFModel.loadSavedModel(findGGUFModelInFolder(), spark) + instance.setModelIfNotSet(spark, model.getModelIfNotSet) + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): AutoGGUFModel = { + // TODO potentially enable download from HF-URLS + val localPath: String = ResourceHelper.copyToLocal(modelPath) + val annotatorModel = new AutoGGUFModel() + annotatorModel + .setModelIfNotSet(spark, GGUFWrapper.read(spark, localPath)) + + val metadata = LlamaModel.getMetadataFromFile(localPath) + if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) + annotatorModel + } +} + +/** This is the companion object of [[AutoGGUFModel]]. Please refer to that class for the + * documentation. 
+ */ +object AutoGGUFModel extends ReadablePretrainedAutoGGUFModel with ReadAutoGGUFModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index c4d887c3e03934..8ed41de985baa9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -32,14 +32,7 @@ import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentModel import com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLModel -import com.johnsnowlabs.nlp.annotators.seq2seq.{ - BartTransformer, - GPT2Transformer, - LLAMA2Transformer, - M2M100Transformer, - MarianTransformer, - T5Transformer -} +import com.johnsnowlabs.nlp.annotators.seq2seq._ import com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel @@ -691,9 +684,10 @@ object PythonResourceDownloader { "LLAMA2Transformer" -> LLAMA2Transformer, "M2M100Transformer" -> M2M100Transformer, "UAEEmbeddings" -> UAEEmbeddings, + "AutoGGUFModel" -> AutoGGUFModel, "AlbertForZeroShotClassification" -> AlbertForZeroShotClassification, "MxbaiEmbeddings" -> MxbaiEmbeddings, - "SnowFlakeEmbeddings" -> SnowFlakeEmbeddings, + "SnowFlakeEmbeddings" -> SnowFlakeEmbeddings ) // List pairs of types such as the one with key type can load a pretrained model from the value type diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index 703f281a1da1d1..6a17297f6fda41 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=WARN, STDOUT +log4j.rootLogger=DEBUG, STDOUT log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n diff --git a/src/test/resources/log4j2.properties b/src/test/resources/log4j2.properties index 703f281a1da1d1..6a17297f6fda41 100644 --- a/src/test/resources/log4j2.properties +++ b/src/test/resources/log4j2.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=WARN, STDOUT +log4j.rootLogger=DEBUG, STDOUT log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala new file mode 100644 index 00000000000000..b4234f24197b7c --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala @@ -0,0 +1,187 @@ +package com.johnsnowlabs.nlp.annotators.seq2seq + +import com.johnsnowlabs.nlp.Annotation +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.apache.spark.sql.DataFrame +import org.scalatest.flatspec.AnyFlatSpec + +class AutoGGUFModelTest extends AnyFlatSpec { + + import ResourceHelper.spark.implicits._ + + behavior of "AutoGGUFModelTest" + + // Set Spark Debug level + 
ResourceHelper.spark.sparkContext.setLogLevel("INFO") + + lazy val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + lazy val model = AutoGGUFModel + .pretrained() + .setInputCols("document") + .setOutputCol("completions") + .setBatchSize(4) + .setNPredict(20) + .setNGpuLayers(99) + .setTemperature(0.4f) + .setNCtx(512) + .setTopK(40) + .setTopP(0.9f) + .setPenalizeNl(true) + + lazy val data = Seq( + "The moons of Jupiter are ", // "The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones. The four" + "Earth is ", // "Earth is 4.5 billion years old. It has been home to countless species, some of which have gone extinct, while others have evolved into" + "The moon is ", // "The moon is 1/400th the size of the sun. The sun is 1.39 million kilometers in diameter, while" + "The sun is " // + ).toDF("text").repartition(1) + + lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) + + def assertAnnotationsNonEmpty(resultDf: DataFrame): Unit = { + Annotation + .collect(resultDf, "completions") + .foreach(annotations => { + println(annotations.head) + assert(annotations.head.result.nonEmpty) + }) + } + + it should "create completions" taggedAs SlowTest in { + val data = Seq("Hello, I am a").toDF("text") + val result = pipeline.fit(data).transform(data) + assertAnnotationsNonEmpty(result) + } + + it should "create batch completions" taggedAs SlowTest in { + val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) + val result = pipeline.fit(data).transform(data) + assertAnnotationsNonEmpty(result) + } + + it should "be serializable" taggedAs SlowTest in { + val data = Seq("Hello, I am a").toDF("text") + lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) + model.setNPredict(5) + + val pipelineModel = pipeline.fit(data) + val savePath = "./tmp_autogguf_model" + pipelineModel.stages.last + .asInstanceOf[AutoGGUFModel] + .write + .overwrite() + .save(savePath) + + val loadedModel = AutoGGUFModel.load(savePath) + val newPipeline: Pipeline = new Pipeline().setStages(Array(documentAssembler, loadedModel)) + + newPipeline + .fit(data) + .transform(data) + .select("completions") + .show(truncate = false) + } + + it should "accept all parameters that are settable" taggedAs SlowTest in { + // Model Parameters + model.setNThreads(8) + model.setNThreadsDraft(8) + model.setNThreadsBatch(8) + model.setNThreadsBatchDraft(8) + model.setNCtx(512) + model.setNBatch(32) + model.setNUbatch(32) + model.setNDraft(5) + model.setNChunks(-1) + model.setNSequences(1) + model.setPSplit(0.1f) + model.setNGpuLayers(99) + model.setNGpuLayersDraft(99) + model.setGpuSplitMode("NONE") + model.setMainGpu(0) + model.setTensorSplit(Array[Double]()) + model.setGrpAttnN(1) + model.setGrpAttnW(512) + model.setRopeFreqBase(1.0f) + model.setRopeFreqScale(1.0f) + model.setYarnExtFactor(1.0f) + model.setYarnAttnFactor(1.0f) + model.setYarnBetaFast(32.0f) + model.setYarnBetaSlow(1.0f) + model.setYarnOrigCtx(0) + model.setDefragmentationThreshold(-1.0f) + model.setNumaStrategy("DISTRIBUTE") + model.setRopeScalingType("UNSPECIFIED") + model.setPoolingType("UNSPECIFIED") + model.setModelDraft("") + model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") + model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") + model.setEmbedding(false) + model.setFlashAttention(false) + model.setInputPrefixBos(false) + model.setUseMmap(false) + model.setUseMlock(false) + 
model.setNoKvOffload(false) + model.setSystemPrompt("") + model.setChatTemplate("") + + // Inference Parameters + model.setInputPrefix("") + model.setInputSuffix("") + model.setCachePrompt(false) + model.setNPredict(-1) + model.setTopK(40) + model.setTopP(0.9f) + model.setMinP(0.1f) + model.setTfsZ(1.0f) + model.setTypicalP(1.0f) + model.setTemperature(0.8f) + model.setDynamicTemperatureRange(0.0f) + model.setDynamicTemperatureExponent(1.0f) + model.setRepeatLastN(64) + model.setRepeatPenalty(1.0f) + model.setFrequencyPenalty(0.0f) + model.setPresencePenalty(0.0f) + model.setMiroStat("DISABLED") + model.setMiroStatTau(5.0f) + model.setMiroStatEta(0.1f) + model.setPenalizeNl(false) + model.setNKeep(0) + model.setSeed(-1) + model.setNProbs(0) + model.setMinKeep(0) + model.setGrammar("") + model.setPenaltyPrompt("") + model.setIgnoreEos(false) + model.setDisableTokenIds(Array[Int]()) + model.setStopStrings(Array[String]()) + model.setUseChatTemplate(false) + model.setNPredict(2) + model.setSamplers(Array("TOP_P", "TOP_K")) + + // Struct Features + model.setTokenIdBias(Map(0 -> 0.0f, 1 -> 0.0f)) + model.setTokenBias(Map("!" -> 0.0f, "?" -> 0.0f)) + model.setLoraAdapters(Map(" " -> 0.0f)) + + lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) + + val result = pipeline.fit(data).transform(data) + result.select("completions").show(truncate = false) + } + + it should "contain metadata when loadSavedModel" taggedAs SlowTest in { + lazy val modelPath = "models/codellama-7b.Q2_K.gguf" + val model = AutoGGUFModel.loadSavedModel(modelPath, ResourceHelper.spark) + val metadata = model.getMetadata + assert(metadata.nonEmpty) + + val metadataMap = model.getMetadataMap + assert(metadataMap.nonEmpty) + } +}
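Beyond `pretrained()`, the reader added in this PR also supports loading a local GGUF file directly via `loadSavedModel`, which reads the file's metadata at load time. A minimal sketch, assuming the same local model path the test above uses actually exists on disk:

```scala
// Sketch only: load a local GGUF file and inspect its metadata.
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel
import com.johnsnowlabs.nlp.util.io.ResourceHelper

val localModel = AutoGGUFModel
  .loadSavedModel("models/codellama-7b.Q2_K.gguf", ResourceHelper.spark)
  .setInputCols("document")
  .setOutputCol("completions")

// Metadata is stored as a protected JSON string param and can also be read as a parsed map.
println(localModel.getMetadata)
localModel.getMetadataMap.foreach { case (key, value) => println(s"$key -> $value") }
```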