diff --git a/build.sbt b/build.sbt index cdbe01697bd207..4f35f22f8ae570 100644 --- a/build.sbt +++ b/build.sbt @@ -6,7 +6,7 @@ name := getPackageName(is_silicon, is_gpu, is_aarch64) organization := "com.johnsnowlabs.nlp" -version := "5.4.2" +version := "5.5.0" (ThisBuild / scalaVersion) := scalaVer @@ -180,6 +180,16 @@ val onnxDependencies: Seq[sbt.ModuleID] = else Seq(onnxCPU) +val llamaCppDependencies = + if (is_gpu.equals("true")) + Seq(llamaCppGPU) + else if (is_silicon.equals("true")) + Seq(llamaCppSilicon) +// else if (is_aarch64.equals("true")) +// Seq(openVinoCPU) + else + Seq(llamaCppCPU) + val openVinoDependencies: Seq[sbt.ModuleID] = if (is_gpu.equals("true")) Seq(openVinoGPU) @@ -202,6 +212,7 @@ lazy val root = (project in file(".")) utilDependencies ++ tensorflowDependencies ++ onnxDependencies ++ + llamaCppDependencies ++ openVinoDependencies ++ typedDependencyParserDependencies, // TODO potentially improve this? diff --git a/docs/en/annotator_entries/AutoGGUF.md b/docs/en/annotator_entries/AutoGGUF.md new file mode 100644 index 00000000000000..4bf8384004b0e0 --- /dev/null +++ b/docs/en/annotator_entries/AutoGGUF.md @@ -0,0 +1,135 @@ +{%- capture title -%} +AutoGGUFModel +{%- endcapture -%} + +{%- capture description -%} +Annotator that uses the llama.cpp library to generate text completions with large language +models. + +For settable parameters, and their explanations, see [HasLlamaCppProperties](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala) and refer to +the llama.cpp documentation of +[server.cpp](https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server) +for more information. + +If the parameters are not set, the annotator will default to use the parameters provided by +the model. + +Pretrained models can be loaded with `pretrained` of the companion object: + +```scala +val autoGGUFModel = AutoGGUFModel.pretrained() + .setInputCols("document") + .setOutputCol("completions") +``` + +The default model is `"gguf-phi3-mini-4k-instruct-q4"`, if no name is provided. + +For available pretrained models please see the [Models Hub](https://sparknlp.org/models). + +For extended examples of usage, see the +[AutoGGUFModelTest](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala) +and the +[example notebook](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb). + +**Note**: To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set +the number of GPU layers with the `setNGpuLayers` method. + +When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` +according to your hardware to avoid out-of-memory errors. +{%- endcapture -%} + +{%- capture input_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture output_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture python_example -%} +>>> import sparknlp +>>> from sparknlp.base import * +>>> from sparknlp.annotator import * +>>> from pyspark.ml import Pipeline +>>> document = DocumentAssembler() \ +... .setInputCol("text") \ +... .setOutputCol("document") +>>> autoGGUFModel = AutoGGUFModel.pretrained() \ +... .setInputCols(["document"]) \ +... .setOutputCol("completions") \ +... .setBatchSize(4) \ +... .setNPredict(20) \ +... .setNGpuLayers(99) \ +... .setTemperature(0.4) \ +... .setTopK(40) \ +... 
.setTopP(0.9) \ +... .setPenalizeNl(True) +>>> pipeline = Pipeline().setStages([document, autoGGUFModel]) +>>> data = spark.createDataFrame([["Hello, I am a"]]).toDF("text") +>>> result = pipeline.fit(data).transform(data) +>>> result.select("completions").show(truncate = False) ++-----------------------------------------------------------------------------------------------------------------------------------+ +|completions | ++-----------------------------------------------------------------------------------------------------------------------------------+ +|[{document, 0, 78, new user. I am currently working on a project and I need to create a list of , {prompt -> Hello, I am a}, []}]| ++-----------------------------------------------------------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture scala_example -%} +import com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline +import spark.implicits._ + +val document = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + +val autoGGUFModel = AutoGGUFModel + .pretrained() + .setInputCols("document") + .setOutputCol("completions") + .setBatchSize(4) + .setNPredict(20) + .setNGpuLayers(99) + .setTemperature(0.4f) + .setTopK(40) + .setTopP(0.9f) + .setPenalizeNl(true) + +val pipeline = new Pipeline().setStages(Array(document, autoGGUFModel)) + +val data = Seq("Hello, I am a").toDF("text") +val result = pipeline.fit(data).transform(data) +result.select("completions").show(truncate = false) ++-----------------------------------------------------------------------------------------------------------------------------------+ +|completions | ++-----------------------------------------------------------------------------------------------------------------------------------+ +|[{document, 0, 78, new user. 
I am currently working on a project and I need to create a list of , {prompt -> Hello, I am a}, []}]| ++-----------------------------------------------------------------------------------------------------------------------------------+ + +{%- endcapture -%} + +{%- capture api_link -%} +[AutoGGUFModel](/api/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel) +{%- endcapture -%} + +{%- capture python_api_link -%} +[AutoGGUFModel](/api/python/reference/autosummary/sparknlp/annotator/seq2seq/auto_gguf_model/index.html) +{%- endcapture -%} + +{%- capture source_link -%} +[AutoGGUFModel](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala) +{%- endcapture -%} + +{% include templates/anno_template.md +title=title +description=description +input_anno=input_anno +output_anno=output_anno +python_example=python_example +scala_example=scala_example +api_link=api_link +python_api_link=python_api_link +source_link=source_link +%} \ No newline at end of file diff --git a/docs/en/annotators.md b/docs/en/annotators.md index b65eae52cc7f12..161bd8f8e3f496 100644 --- a/docs/en/annotators.md +++ b/docs/en/annotators.md @@ -45,6 +45,7 @@ There are two types of Annotators: {:.table-model-big} |Annotator|Description|Version | |---|---|---| +{% include templates/anno_table_entry.md path="" name="AutoGGUFModel" summary="Annotator that uses the llama.cpp library to generate text completions with large language models."%} {% include templates/anno_table_entry.md path="" name="BGEEmbeddings" summary="Sentence embeddings using BGE."%} {% include templates/anno_table_entry.md path="" name="BigTextMatcher" summary="Annotator to match exact phrases (by token) provided in a file against a Document."%} {% include templates/anno_table_entry.md path="" name="Chunk2Doc" summary="Converts a `CHUNK` type column back into `DOCUMENT`. Useful when trying to re-tokenize or do further analysis on a `CHUNK` result."%} diff --git a/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb new file mode 100644 index 00000000000000..f07f3892e2d38f --- /dev/null +++ b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb @@ -0,0 +1,628 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb)\n", + "\n", + "# Import llama.cpp 🦙 models into Spark NLP 🚀\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- llama.cpp support was introduced in `Spark NLP 5.5.0`, enabling quantized LLM inference on a wide range of devices. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You need to use your own `.gguf` model files, which also include the models from the [Hugging Face Models](https://huggingface.co/models?library=gguf)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download a GGUF Model\n", + "\n", + "Lets download a GGUF model to test it out. For this, we will use [microsoft/Phi-3-mini-4k-instruct-gguf](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf). It is a 3.8B parameter model which also is available in 4-bit quantization. 
\n", + "\n", + "We can download the model by selecting the q4 GGUF file from the \"Files and versions\" tab.\n", + "\n", + "Once downloaded, we can directly import this model into Spark NLP!" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-07-20 11:11:30-- https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf?download=true\n", + "Resolving huggingface.co (huggingface.co)... 2600:9000:275f:7600:17:b174:6d00:93a1, 2600:9000:275f:3800:17:b174:6d00:93a1, 2600:9000:275f:6e00:17:b174:6d00:93a1, ...\n", + "Connecting to huggingface.co (huggingface.co)|2600:9000:275f:7600:17:b174:6d00:93a1|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://cdn-lfs-us-1.huggingface.co/repos/41/c8/41c860f65b01de5dc4c68b00d84cead799d3e7c48e38ee749f4c6057776e2e9e/8a83c7fb9049a9b2e92266fa7ad04933bb53aa1e85136b7b30f1b8000ff2edef?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27Phi-3-mini-4k-instruct-q4.gguf%3B+filename%3D%22Phi-3-mini-4k-instruct-q4.gguf%22%3B&Expires=1721725890&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMTcyNTg5MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzQxL2M4LzQxYzg2MGY2NWIwMWRlNWRjNGM2OGIwMGQ4NGNlYWQ3OTlkM2U3YzQ4ZTM4ZWU3NDlmNGM2MDU3Nzc2ZTJlOWUvOGE4M2M3ZmI5MDQ5YTliMmU5MjI2NmZhN2FkMDQ5MzNiYjUzYWExZTg1MTM2YjdiMzBmMWI4MDAwZmYyZWRlZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=joXQf4QRpEhtFeQ3r3gJ0zyJ3bXReb9OxM%7EZit3GJ3355ycKQzemJ%7E6eD-J7%7EkphnsPpRpUDhQkCr2-Oidqo7dgltmFsWAX4SmQLn65R1yjO%7EsMvi%7E4vOUpaRPYlSMCyWWJpiZZjQYVH4Uk0o-G62ALFXKGaDfr627kvahP-fJYwNNP1riTrH8hbbah28ZKRAQjUGI1aNqerG0jojudnGOagawISAnudkAOFZfxnN7Qw3CoMySZLj9Euu02RBv2A5Yy0uSjG7b8rilx-tU5HDR3ECohdQQ8yPXjYFU-LZi-zcG1wwBDF-S01qb%7EgPWsTorenxfRM2cG6J%7EvSziGCzA__&Key-Pair-Id=K24J24Z295AEI9 [following]\n", + "--2024-07-20 11:11:30-- https://cdn-lfs-us-1.huggingface.co/repos/41/c8/41c860f65b01de5dc4c68b00d84cead799d3e7c48e38ee749f4c6057776e2e9e/8a83c7fb9049a9b2e92266fa7ad04933bb53aa1e85136b7b30f1b8000ff2edef?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27Phi-3-mini-4k-instruct-q4.gguf%3B+filename%3D%22Phi-3-mini-4k-instruct-q4.gguf%22%3B&Expires=1721725890&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMTcyNTg5MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzQxL2M4LzQxYzg2MGY2NWIwMWRlNWRjNGM2OGIwMGQ4NGNlYWQ3OTlkM2U3YzQ4ZTM4ZWU3NDlmNGM2MDU3Nzc2ZTJlOWUvOGE4M2M3ZmI5MDQ5YTliMmU5MjI2NmZhN2FkMDQ5MzNiYjUzYWExZTg1MTM2YjdiMzBmMWI4MDAwZmYyZWRlZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=joXQf4QRpEhtFeQ3r3gJ0zyJ3bXReb9OxM%7EZit3GJ3355ycKQzemJ%7E6eD-J7%7EkphnsPpRpUDhQkCr2-Oidqo7dgltmFsWAX4SmQLn65R1yjO%7EsMvi%7E4vOUpaRPYlSMCyWWJpiZZjQYVH4Uk0o-G62ALFXKGaDfr627kvahP-fJYwNNP1riTrH8hbbah28ZKRAQjUGI1aNqerG0jojudnGOagawISAnudkAOFZfxnN7Qw3CoMySZLj9Euu02RBv2A5Yy0uSjG7b8rilx-tU5HDR3ECohdQQ8yPXjYFU-LZi-zcG1wwBDF-S01qb%7EgPWsTorenxfRM2cG6J%7EvSziGCzA__&Key-Pair-Id=K24J24Z295AEI9\n", + "Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 2600:9000:275f:7a00:17:9a40:4dc0:93a1, 2600:9000:275f:fc00:17:9a40:4dc0:93a1, 2600:9000:275f:4800:17:9a40:4dc0:93a1, ...\n", + "Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|2600:9000:275f:7a00:17:9a40:4dc0:93a1|:443... 
connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 2393231072 (2.2G) [binary/octet-stream]\n", + "Saving to: ‘Phi-3-mini-4k-instruct-q4.gguf?download=true’\n", + "\n", + "Phi-3-mini-4k-instr 100%[===================>] 2.23G 22.5MB/s in 96s \n", + "\n", + "2024-07-20 11:13:06 (23.7 MB/s) - ‘Phi-3-mini-4k-instruct-q4.gguf?download=true’ saved [2393231072/2393231072]\n", + "\n" + ] + } + ], + "source": [ + "EXPORT_PATH = \"Phi-3-mini-4k-instruct-q4.gguf\"\n", + "! wget \"https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf?download=true\" -O {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save AutGGUF models in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP (if running it Google Colab)\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only execute this if you are on Google Colab\n", + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24/07/21 10:51:01 WARN Utils: Your hostname, pop-os resolves to a loopback address: 127.0.1.1; using 192.168.0.34 instead (on interface enp3s0)\n", + "24/07/21 10:51:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ":: loading settings :: url = jar:file:/home/ducha/mambaforge/envs/sparknlp_dev/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Ivy Default Cache set to: /home/ducha/.ivy2/cache\n", + "The jars for the packages stored in: /home/ducha/.ivy2/jars\n", + "com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency\n", + ":: resolving dependencies :: org.apache.spark#spark-submit-parent-994cb793-bb56-4b46-ad2f-b20d68529970;1.0\n", + "\tconfs: [default]\n", + "\tfound com.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 in central\n", + "\tfound com.typesafe#config;1.4.2 in central\n", + "\tfound org.rocksdb#rocksdbjni;6.29.5 in central\n", + "\tfound com.amazonaws#aws-java-sdk-s3;1.12.500 in central\n", + "\tfound com.amazonaws#aws-java-sdk-kms;1.12.500 in central\n", + "\tfound com.amazonaws#aws-java-sdk-core;1.12.500 in central\n", + "\tfound commons-logging#commons-logging;1.1.3 in central\n", + "\tfound commons-codec#commons-codec;1.15 in central\n", + "\tfound org.apache.httpcomponents#httpclient;4.5.13 in central\n", + "\tfound org.apache.httpcomponents#httpcore;4.4.13 in central\n", + "\tfound software.amazon.ion#ion-java;1.0.2 in central\n", + "\tfound joda-time#joda-time;2.8.1 in central\n", + "\tfound com.amazonaws#jmespath-java;1.12.500 in central\n", + "\tfound com.github.universal-automata#liblevenshtein;3.0.0 in central\n", + "\tfound com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central\n", + "\tfound com.google.protobuf#protobuf-java;3.0.0-beta-3 in central\n", + "\tfound com.google.code.gson#gson;2.3 in central\n", + "\tfound it.unimi.dsi#fastutil;7.0.12 in central\n", + "\tfound org.projectlombok#lombok;1.16.8 in central\n", 
+ "\tfound com.google.cloud#google-cloud-storage;2.20.1 in central\n", + "\tfound com.google.guava#guava;31.1-jre in central\n", + "\tfound com.google.guava#failureaccess;1.0.1 in central\n", + "\tfound com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in central\n", + "\tfound com.google.errorprone#error_prone_annotations;2.18.0 in central\n", + "\tfound com.google.j2objc#j2objc-annotations;1.3 in central\n", + "\tfound com.google.http-client#google-http-client;1.43.0 in central\n", + "\tfound io.opencensus#opencensus-contrib-http-util;0.31.1 in central\n", + "\tfound com.google.http-client#google-http-client-jackson2;1.43.0 in central\n", + "\tfound com.google.http-client#google-http-client-gson;1.43.0 in central\n", + "\tfound com.google.api-client#google-api-client;2.2.0 in central\n", + "\tfound com.google.oauth-client#google-oauth-client;1.34.1 in central\n", + "\tfound com.google.http-client#google-http-client-apache-v2;1.43.0 in central\n", + "\tfound com.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 in central\n", + "\tfound com.google.code.gson#gson;2.10.1 in central\n", + "\tfound com.google.cloud#google-cloud-core;2.12.0 in central\n", + "\tfound io.grpc#grpc-context;1.53.0 in central\n", + "\tfound com.google.auto.value#auto-value-annotations;1.10.1 in central\n", + "\tfound com.google.auto.value#auto-value;1.10.1 in central\n", + "\tfound javax.annotation#javax.annotation-api;1.3.2 in central\n", + "\tfound com.google.cloud#google-cloud-core-http;2.12.0 in central\n", + "\tfound com.google.http-client#google-http-client-appengine;1.43.0 in central\n", + "\tfound com.google.api#gax-httpjson;0.108.2 in central\n", + "\tfound com.google.cloud#google-cloud-core-grpc;2.12.0 in central\n", + "\tfound io.grpc#grpc-alts;1.53.0 in central\n", + "\tfound io.grpc#grpc-grpclb;1.53.0 in central\n", + "\tfound org.conscrypt#conscrypt-openjdk-uber;2.5.2 in central\n", + "\tfound io.grpc#grpc-auth;1.53.0 in central\n", + "\tfound io.grpc#grpc-protobuf;1.53.0 in central\n", + "\tfound io.grpc#grpc-protobuf-lite;1.53.0 in central\n", + "\tfound io.grpc#grpc-core;1.53.0 in central\n", + "\tfound com.google.api#gax;2.23.2 in central\n", + "\tfound com.google.api#gax-grpc;2.23.2 in central\n", + "\tfound com.google.auth#google-auth-library-credentials;1.16.0 in central\n", + "\tfound com.google.auth#google-auth-library-oauth2-http;1.16.0 in central\n", + "\tfound com.google.api#api-common;2.6.2 in central\n", + "\tfound io.opencensus#opencensus-api;0.31.1 in central\n", + "\tfound com.google.api.grpc#proto-google-iam-v1;1.9.2 in central\n", + "\tfound com.google.protobuf#protobuf-java;3.21.12 in central\n", + "\tfound com.google.protobuf#protobuf-java-util;3.21.12 in central\n", + "\tfound com.google.api.grpc#proto-google-common-protos;2.14.2 in central\n", + "\tfound org.threeten#threetenbp;1.6.5 in central\n", + "\tfound com.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha in central\n", + "\tfound com.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha in central\n", + "\tfound com.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha in central\n", + "\tfound com.google.code.findbugs#jsr305;3.0.2 in central\n", + "\tfound io.grpc#grpc-api;1.53.0 in central\n", + "\tfound io.grpc#grpc-stub;1.53.0 in central\n", + "\tfound org.checkerframework#checker-qual;3.31.0 in central\n", + "\tfound io.perfmark#perfmark-api;0.26.0 in central\n", + "\tfound com.google.android#annotations;4.1.1.4 in central\n", + "\tfound 
org.codehaus.mojo#animal-sniffer-annotations;1.22 in central\n", + "\tfound io.opencensus#opencensus-proto;0.2.0 in central\n", + "\tfound io.grpc#grpc-services;1.53.0 in central\n", + "\tfound com.google.re2j#re2j;1.6 in central\n", + "\tfound io.grpc#grpc-netty-shaded;1.53.0 in central\n", + "\tfound io.grpc#grpc-googleapis;1.53.0 in central\n", + "\tfound io.grpc#grpc-xds;1.53.0 in central\n", + "\tfound com.navigamez#greex;1.0 in central\n", + "\tfound dk.brics.automaton#automaton;1.11-8 in central\n", + "\tfound com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 in central\n", + "\tfound com.microsoft.onnxruntime#onnxruntime;1.17.0 in central\n", + ":: resolution report :: resolve 843ms :: artifacts dl 40ms\n", + "\t:: modules in use:\n", + "\tcom.amazonaws#aws-java-sdk-core;1.12.500 from central in [default]\n", + "\tcom.amazonaws#aws-java-sdk-kms;1.12.500 from central in [default]\n", + "\tcom.amazonaws#aws-java-sdk-s3;1.12.500 from central in [default]\n", + "\tcom.amazonaws#jmespath-java;1.12.500 from central in [default]\n", + "\tcom.github.universal-automata#liblevenshtein;3.0.0 from central in [default]\n", + "\tcom.google.android#annotations;4.1.1.4 from central in [default]\n", + "\tcom.google.api#api-common;2.6.2 from central in [default]\n", + "\tcom.google.api#gax;2.23.2 from central in [default]\n", + "\tcom.google.api#gax-grpc;2.23.2 from central in [default]\n", + "\tcom.google.api#gax-httpjson;0.108.2 from central in [default]\n", + "\tcom.google.api-client#google-api-client;2.2.0 from central in [default]\n", + "\tcom.google.api.grpc#gapic-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", + "\tcom.google.api.grpc#grpc-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", + "\tcom.google.api.grpc#proto-google-cloud-storage-v2;2.20.1-alpha from central in [default]\n", + "\tcom.google.api.grpc#proto-google-common-protos;2.14.2 from central in [default]\n", + "\tcom.google.api.grpc#proto-google-iam-v1;1.9.2 from central in [default]\n", + "\tcom.google.apis#google-api-services-storage;v1-rev20220705-2.0.0 from central in [default]\n", + "\tcom.google.auth#google-auth-library-credentials;1.16.0 from central in [default]\n", + "\tcom.google.auth#google-auth-library-oauth2-http;1.16.0 from central in [default]\n", + "\tcom.google.auto.value#auto-value;1.10.1 from central in [default]\n", + "\tcom.google.auto.value#auto-value-annotations;1.10.1 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core;2.12.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core-grpc;2.12.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-core-http;2.12.0 from central in [default]\n", + "\tcom.google.cloud#google-cloud-storage;2.20.1 from central in [default]\n", + "\tcom.google.code.findbugs#jsr305;3.0.2 from central in [default]\n", + "\tcom.google.code.gson#gson;2.10.1 from central in [default]\n", + "\tcom.google.errorprone#error_prone_annotations;2.18.0 from central in [default]\n", + "\tcom.google.guava#failureaccess;1.0.1 from central in [default]\n", + "\tcom.google.guava#guava;31.1-jre from central in [default]\n", + "\tcom.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava from central in [default]\n", + "\tcom.google.http-client#google-http-client;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-apache-v2;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-appengine;1.43.0 from central in [default]\n", + 
"\tcom.google.http-client#google-http-client-gson;1.43.0 from central in [default]\n", + "\tcom.google.http-client#google-http-client-jackson2;1.43.0 from central in [default]\n", + "\tcom.google.j2objc#j2objc-annotations;1.3 from central in [default]\n", + "\tcom.google.oauth-client#google-oauth-client;1.34.1 from central in [default]\n", + "\tcom.google.protobuf#protobuf-java;3.21.12 from central in [default]\n", + "\tcom.google.protobuf#protobuf-java-util;3.21.12 from central in [default]\n", + "\tcom.google.re2j#re2j;1.6 from central in [default]\n", + "\tcom.johnsnowlabs.nlp#spark-nlp_2.12;5.3.3 from central in [default]\n", + "\tcom.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.4.4 from central in [default]\n", + "\tcom.microsoft.onnxruntime#onnxruntime;1.17.0 from central in [default]\n", + "\tcom.navigamez#greex;1.0 from central in [default]\n", + "\tcom.typesafe#config;1.4.2 from central in [default]\n", + "\tcommons-codec#commons-codec;1.15 from central in [default]\n", + "\tcommons-logging#commons-logging;1.1.3 from central in [default]\n", + "\tdk.brics.automaton#automaton;1.11-8 from central in [default]\n", + "\tio.grpc#grpc-alts;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-api;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-auth;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-context;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-core;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-googleapis;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-grpclb;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-netty-shaded;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-protobuf;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-protobuf-lite;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-services;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-stub;1.53.0 from central in [default]\n", + "\tio.grpc#grpc-xds;1.53.0 from central in [default]\n", + "\tio.opencensus#opencensus-api;0.31.1 from central in [default]\n", + "\tio.opencensus#opencensus-contrib-http-util;0.31.1 from central in [default]\n", + "\tio.opencensus#opencensus-proto;0.2.0 from central in [default]\n", + "\tio.perfmark#perfmark-api;0.26.0 from central in [default]\n", + "\tit.unimi.dsi#fastutil;7.0.12 from central in [default]\n", + "\tjavax.annotation#javax.annotation-api;1.3.2 from central in [default]\n", + "\tjoda-time#joda-time;2.8.1 from central in [default]\n", + "\torg.apache.httpcomponents#httpclient;4.5.13 from central in [default]\n", + "\torg.apache.httpcomponents#httpcore;4.4.13 from central in [default]\n", + "\torg.checkerframework#checker-qual;3.31.0 from central in [default]\n", + "\torg.codehaus.mojo#animal-sniffer-annotations;1.22 from central in [default]\n", + "\torg.conscrypt#conscrypt-openjdk-uber;2.5.2 from central in [default]\n", + "\torg.projectlombok#lombok;1.16.8 from central in [default]\n", + "\torg.rocksdb#rocksdbjni;6.29.5 from central in [default]\n", + "\torg.threeten#threetenbp;1.6.5 from central in [default]\n", + "\tsoftware.amazon.ion#ion-java;1.0.2 from central in [default]\n", + "\t:: evicted modules:\n", + "\tcommons-logging#commons-logging;1.2 by [commons-logging#commons-logging;1.1.3] in [default]\n", + "\tcommons-codec#commons-codec;1.11 by [commons-codec#commons-codec;1.15] in [default]\n", + "\tcom.google.protobuf#protobuf-java-util;3.0.0-beta-3 by [com.google.protobuf#protobuf-java-util;3.21.12] in [default]\n", + "\tcom.google.protobuf#protobuf-java;3.0.0-beta-3 by [com.google.protobuf#protobuf-java;3.21.12] 
in [default]\n", + "\tcom.google.code.gson#gson;2.3 by [com.google.code.gson#gson;2.10.1] in [default]\n", + "\t---------------------------------------------------------------------\n", + "\t| | modules || artifacts |\n", + "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", + "\t---------------------------------------------------------------------\n", + "\t| default | 83 | 0 | 0 | 5 || 78 | 0 |\n", + "\t---------------------------------------------------------------------\n", + ":: retrieving :: org.apache.spark#spark-submit-parent-994cb793-bb56-4b46-ad2f-b20d68529970\n", + "\tconfs: [default]\n", + "\t0 artifacts copied, 78 already retrieved (0kB/22ms)\n", + "24/07/21 10:51:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP with GPU enabled. If you don't have GPUs available remove this parameter.\n", + "spark = sparknlp.start(gpu=True)\n", + "print(sparknlp.version())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use the `loadSavedModel` functon in `AutoGGUFModel`\n", + "- Most params will be set automatically. They can also be set later after loading the model in `AutoGGUFModel` during runtime, so don't worry about setting them now.\n", + "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracted 'libllama.so' to '/tmp/libllama.so'\n", + "Extracted 'libjllama.so' to '/tmp/libjllama.so'\n" + ] + } + ], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original ONNX model\n", + "autoGGUFModel = (\n", + " AutoGGUFModel.loadSavedModel(EXPORT_PATH, spark)\n", + " .setInputCols(\"document\")\n", + " .setOutputCol(\"completions\")\n", + " .setBatchSize(4)\n", + " .setNPredict(20)\n", + " .setNGpuLayers(99)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "autoGGUFModel.write().overwrite().save(f\"Phi-3-mini-4k-instruct-q4_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your GGUF model from loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 2337168\n", + "drwxr-xr-x 2 ducha ducha 4096 Jul 21 16:24 metadata\n", + "-rwxrwxr-x 1 ducha ducha 2393231072 Jul 21 16:24 Phi-3-mini-4k-instruct-q4.gguf\n" + ] + } + ], + "source": [ + "! ls -l Phi-3-mini-4k-instruct-q4_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny GGUF model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] build info build=3008 commit=\"1d8fca72\"\n", + "[INFO] system info n_threads=6 n_threads_batch=-1 total_threads=6 system_info=\"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \"\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_loader: loaded meta data with 24 key-value pairs and 195 tensors from /tmp/spark-bbad4f64-91a7-4b6e-8242-7f91e6abca54/userFiles-f7d4e4e9-c02d-46e4-81b5-bf5a26d70930/Phi-3-mini-4k-instruct-q4.gguf (version GGUF V3 (latest))\n", + "llama_model_loader: Dumping metadata keys/values. 
Note: KV overrides do not apply in this output.\n", + "llama_model_loader: - kv 0: general.architecture str = phi3\n", + "llama_model_loader: - kv 1: general.name str = Phi3\n", + "llama_model_loader: - kv 2: phi3.context_length u32 = 4096\n", + "llama_model_loader: - kv 3: phi3.embedding_length u32 = 3072\n", + "llama_model_loader: - kv 4: phi3.feed_forward_length u32 = 8192\n", + "llama_model_loader: - kv 5: phi3.block_count u32 = 32\n", + "llama_model_loader: - kv 6: phi3.attention.head_count u32 = 32\n", + "llama_model_loader: - kv 7: phi3.attention.head_count_kv u32 = 32\n", + "llama_model_loader: - kv 8: phi3.attention.layer_norm_rms_epsilon f32 = 0.000010\n", + "llama_model_loader: - kv 9: phi3.rope.dimension_count u32 = 96\n", + "llama_model_loader: - kv 10: general.file_type u32 = 15\n", + "llama_model_loader: - kv 11: tokenizer.ggml.model str = llama\n", + "llama_model_loader: - kv 12: tokenizer.ggml.pre str = default\n", + "llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32064] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n", + "llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32064] = [0.000000, 0.000000, 0.000000, 0.0000...\n", + "llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32064] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n", + "llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n", + "llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 32000\n", + "llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n", + "llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 32000\n", + "llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n", + "llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n", + "llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...\n", + "llama_model_loader: - kv 23: general.quantization_version u32 = 2\n", + "llama_model_loader: - type f32: 65 tensors\n", + "llama_model_loader: - type q4_K: 81 tensors\n", + "llama_model_loader: - type q5_K: 32 tensors\n", + "llama_model_loader: - type q6_K: 17 tensors\n", + "llm_load_vocab: special tokens definition check successful ( 323/32064 ).\n", + "llm_load_print_meta: format = GGUF V3 (latest)\n", + "llm_load_print_meta: arch = phi3\n", + "llm_load_print_meta: vocab type = SPM\n", + "llm_load_print_meta: n_vocab = 32064\n", + "llm_load_print_meta: n_merges = 0\n", + "llm_load_print_meta: n_ctx_train = 4096\n", + "llm_load_print_meta: n_embd = 3072\n", + "llm_load_print_meta: n_head = 32\n", + "llm_load_print_meta: n_head_kv = 32\n", + "llm_load_print_meta: n_layer = 32\n", + "llm_load_print_meta: n_rot = 96\n", + "llm_load_print_meta: n_embd_head_k = 96\n", + "llm_load_print_meta: n_embd_head_v = 96\n", + "llm_load_print_meta: n_gqa = 1\n", + "llm_load_print_meta: n_embd_k_gqa = 3072\n", + "llm_load_print_meta: n_embd_v_gqa = 3072\n", + "llm_load_print_meta: f_norm_eps = 0.0e+00\n", + "llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n", + "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", + "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", + "llm_load_print_meta: f_logit_scale = 0.0e+00\n", + "llm_load_print_meta: n_ff = 8192\n", + "llm_load_print_meta: n_expert = 0\n", + "llm_load_print_meta: n_expert_used = 0\n", + "llm_load_print_meta: causal attn = 1\n", + "llm_load_print_meta: pooling type = 0\n", + "llm_load_print_meta: rope type = 2\n", + "llm_load_print_meta: rope scaling = linear\n", + "llm_load_print_meta: freq_base_train = 10000.0\n", + 
"llm_load_print_meta: freq_scale_train = 1\n", + "llm_load_print_meta: n_yarn_orig_ctx = 4096\n", + "llm_load_print_meta: rope_finetuned = unknown\n", + "llm_load_print_meta: ssm_d_conv = 0\n", + "llm_load_print_meta: ssm_d_inner = 0\n", + "llm_load_print_meta: ssm_d_state = 0\n", + "llm_load_print_meta: ssm_dt_rank = 0\n", + "llm_load_print_meta: model type = 3B\n", + "llm_load_print_meta: model ftype = Q4_K - Medium\n", + "llm_load_print_meta: model params = 3.82 B\n", + "llm_load_print_meta: model size = 2.23 GiB (5.01 BPW) \n", + "llm_load_print_meta: general.name = Phi3\n", + "llm_load_print_meta: BOS token = 1 ''\n", + "llm_load_print_meta: EOS token = 32000 '<|endoftext|>'\n", + "llm_load_print_meta: UNK token = 0 ''\n", + "llm_load_print_meta: PAD token = 32000 '<|endoftext|>'\n", + "llm_load_print_meta: LF token = 13 '<0x0A>'\n", + "llm_load_print_meta: EOT token = 32007 '<|end|>'\n", + "ggml_cuda_init: failed to initialize CUDA: unknown error\n", + "llm_load_tensors: ggml ctx size = 0.11 MiB\n", + "llm_load_tensors: offloading 32 repeating layers to GPU\n", + "llm_load_tensors: offloading non-repeating layers to GPU\n", + "llm_load_tensors: offloaded 33/33 layers to GPU\n", + "llm_load_tensors: CPU buffer size = 2281.66 MiB\n", + "...........................................................................................\n", + "llama_new_context_with_model: n_ctx = 512\n", + "llama_new_context_with_model: n_batch = 512\n", + "llama_new_context_with_model: n_ubatch = 512\n", + "llama_new_context_with_model: flash_attn = 0\n", + "llama_new_context_with_model: freq_base = 10000.0\n", + "llama_new_context_with_model: freq_scale = 1\n", + "ggml_cuda_host_malloc: failed to allocate 192.00 MiB of pinned memory: unknown error\n", + "llama_kv_cache_init: CPU KV buffer size = 192.00 MiB\n", + "llama_new_context_with_model: KV self size = 192.00 MiB, K (f16): 96.00 MiB, V (f16): 96.00 MiB\n", + "ggml_cuda_host_malloc: failed to allocate 0.61 MiB of pinned memory: unknown error\n", + "llama_new_context_with_model: CPU output buffer size = 0.61 MiB\n", + "ggml_cuda_host_malloc: failed to allocate 83.01 MiB of pinned memory: unknown error\n", + "llama_new_context_with_model: CUDA_Host compute buffer size = 83.01 MiB\n", + "llama_new_context_with_model: graph nodes = 1286\n", + "llama_new_context_with_model: graph splits = 1\n", + "[Stage 5:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] initializing slots n_slots=4\n", + "[INFO] new slot id_slot=0 n_ctx_slot=128\n", + "[INFO] new slot id_slot=1 n_ctx_slot=128\n", + "[INFO] new slot id_slot=2 n_ctx_slot=128\n", + "[INFO] new slot id_slot=3 n_ctx_slot=128\n", + "[INFO] model loaded\n", + "[INFO] chat template chat_example=\"<|system|>\\nYou are a helpful assistant<|end|>\\n<|user|>\\nHello<|end|>\\n<|assistant|>\\nHi there<|end|>\\n<|user|>\\nHow are you?<|end|>\\n<|assistant|>\\n\" built_in=true\n", + "[INFO] all slots are idle\n", + "[INFO] slot is processing task id_slot=0 id_task=0\n", + "[INFO] kv cache rm [p0, end) id_slot=0 id_task=0 p0=0\n", + "[INFO] prompt eval time = 318.87 ms / 5 tokens ( 63.77 ms per token, 15.68 tokens per second) id_slot=0 id_task=0 t_prompt_processing=318.873 n_prompt_tokens_processed=5 t_token=63.7746 n_tokens_second=15.680223788153905\n", + "[INFO] generation eval time = 4136.03 ms / 20 runs ( 206.80 ms per token, 4.84 tokens per second) id_slot=0 id_task=0 t_token_generation=4136.032 n_decoded=20 t_token=206.8016 n_tokens_second=4.835552529574239\n", + 
"[INFO] total time = 4454.90 ms id_slot=0 id_task=0 t_prompt_processing=318.873 t_token_generation=4136.032 t_total=4454.905\n", + "[INFO] slot released id_slot=0 id_task=0 n_ctx=512 n_past=24 n_system_tokens=0 n_cache_tokens=0 truncated=false\n", + "[INFO] all slots are idle\n", + "+------------------------------------------------------------------------------------------------------------+\n", + "|completions |\n", + "+------------------------------------------------------------------------------------------------------------+\n", + "|[{document, 0, 78, 384,000 kilometers away from the Earth. Use scientific notation to express this, {}, []}]|\n", + "+------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", + "\n", + "auto_gguf_model = AutoGGUFModel.load(\"Phi-3-mini-4k-instruct-q4_spark_nlp\")\n", + "\n", + "pipeline = Pipeline().setStages([document_assembler, auto_gguf_model])\n", + "\n", + "data = spark.createDataFrame([[\"The moon is \"]]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(data).transform(data)\n", + "result.select(\"completions\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You can now go wild and use hundreds of GGUF models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/project/Dependencies.scala b/project/Dependencies.scala index e44934a3e06d47..202fb048044dea 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -128,5 +128,10 @@ object Dependencies { val azureIdentity = "com.azure" % "azure-identity" % azureIdentityVersion % Provided val azureStorage = "com.azure" % "azure-storage-blob" % azureStorageVersion % Provided + val llamaCppVersion = "0.1.1-rc2" + val llamaCppCPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-cpu" % llamaCppVersion + val llamaCppGPU = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-gpu" % llamaCppVersion + val llamaCppSilicon = "com.johnsnowlabs.nlp" %% "jsl-llamacpp-silicon" % llamaCppVersion + /** ------- Dependencies end ------- */ } diff --git a/python/setup.py b/python/setup.py index 1a41299ee3ab5c..cebe55084f4427 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,7 +41,7 @@ # project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='5.4.2', # Required + version='5.5.0', # Required # This is a one-line description or tagline of what your project does. 
This # corresponds to the 'Summary' metadata field: diff --git a/python/sparknlp/annotator/seq2seq/__init__.py b/python/sparknlp/annotator/seq2seq/__init__.py index 69ee444e14d00a..e9c3984c21ecc1 100644 --- a/python/sparknlp/annotator/seq2seq/__init__.py +++ b/python/sparknlp/annotator/seq2seq/__init__.py @@ -21,9 +21,10 @@ from sparknlp.annotator.seq2seq.m2m100_transformer import * from sparknlp.annotator.seq2seq.phi2_transformer import * from sparknlp.annotator.seq2seq.mistral_transformer import * +from sparknlp.annotator.seq2seq.auto_gguf_model import * from sparknlp.annotator.seq2seq.phi3_transformer import * from sparknlp.annotator.seq2seq.nllb_transformer import * from sparknlp.annotator.seq2seq.cpm_transformer import * from sparknlp.annotator.seq2seq.qwen_transformer import * from sparknlp.annotator.seq2seq.starcoder_transformer import * -from sparknlp.annotator.seq2seq.llama3_transformer import * \ No newline at end of file +from sparknlp.annotator.seq2seq.llama3_transformer import * diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py new file mode 100755 index 00000000000000..4441d1d8c6e88b --- /dev/null +++ b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py @@ -0,0 +1,804 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for the AutoGGUFModel.""" +from typing import List, Dict + +from sparknlp.common import * + + +class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate): + """ + Annotator that uses the llama.cpp library to generate text completions with large language + models. + + For settable parameters, and their explanations, see the parameters of this class and refer to + the llama.cpp documentation of + `server.cpp `__ + for more information. + + If the parameters are not set, the annotator will default to use the parameters provided by + the model. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> auto_gguf_model = AutoGGUFModel.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("completions") + + The default model is ``"gguf-phi3-mini-4k-instruct-q4"``, if no name is provided. + + For extended examples of usage, see the + `AutoGGUFModelTest `__ + and the + `example notebook `__. + + For available pretrained models please see the `Models Hub `__. 
+ + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``DOCUMENT`` + ====================== ====================== + + Parameters + ---------- + nThreads + Set the number of threads to use during generation + nThreadsDraft + Set the number of threads to use during draft generation + nThreadsBatch + Set the number of threads to use during batch and prompt processing + nThreadsBatchDraft + Set the number of threads to use during batch and prompt processing + nCtx + Set the size of the prompt context + nBatch + Set the logical batch size for prompt processing (must be >=32 to use BLAS) + nUbatch + Set the physical batch size for prompt processing (must be >=32 to use BLAS) + nDraft + Set the number of tokens to draft for speculative decoding + nChunks + Set the maximal number of chunks to process + nSequences + Set the number of sequences to decode + pSplit + Set the speculative decoding split probability + nGpuLayers + Set the number of layers to store in VRAM (-1 - use default) + nGpuLayersDraft + Set the number of layers to store in VRAM for the draft model (-1 - use default) + gpuSplitMode + Set how to split the model across GPUs + mainGpu + Set the main GPU that is used for scratch and small tensors. + tensorSplit + Set how split tensors should be distributed across GPUs + grpAttnN + Set the group-attention factor + grpAttnW + Set the group-attention width + ropeFreqBase + Set the RoPE base frequency, used by NTK-aware scaling + ropeFreqScale + Set the RoPE frequency scaling factor, expands context by a factor of 1/N + yarnExtFactor + Set the YaRN extrapolation mix factor + yarnAttnFactor + Set the YaRN scale sqrt(t) or attention magnitude + yarnBetaFast + Set the YaRN low correction dim or beta + yarnBetaSlow + Set the YaRN high correction dim or alpha + yarnOrigCtx + Set the YaRN original context size of model + defragmentationThreshold + Set the KV cache defragmentation threshold + numaStrategy + Set optimization strategies that help on some NUMA systems (if available) + ropeScalingType + Set the RoPE frequency scaling method, defaults to linear unless specified by the model + poolingType + Set the pooling type for embeddings, use model default if unspecified + modelDraft + Set the draft model for speculative decoding + modelAlias + Set a model alias + lookupCacheStaticFilePath + Set path to static lookup cache to use for lookup decoding (not updated by generation) + lookupCacheDynamicFilePath + Set path to dynamic lookup cache to use for lookup decoding (updated by generation) + embedding + Whether to load model with embedding support + flashAttention + Whether to enable Flash Attention + inputPrefixBos + Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string + useMmap + Whether to use memory-map model (faster load but may increase pageouts if not using mlock) + useMlock + Whether to force the system to keep model in RAM rather than swapping or compressing + noKvOffload + Whether to disable KV offload + systemPrompt + Set a system prompt to use + chatTemplate + The chat template to use + inputPrefix + Set the prompt to start generation with + inputSuffix + Set a suffix for infilling + cachePrompt + Whether to remember the prompt to avoid reprocessing it + nPredict + Set the number of tokens to predict + topK + Set top-k sampling + topP + Set top-p sampling + minP + Set min-p sampling + tfsZ + Set tail free sampling, parameter z + typicalP + Set locally 
typical sampling, parameter p + temperature + Set the temperature + dynatempRange + Set the dynamic temperature range + dynatempExponent + Set the dynamic temperature exponent + repeatLastN + Set the last n tokens to consider for penalties + repeatPenalty + Set the penalty of repeated sequences of tokens + frequencyPenalty + Set the repetition alpha frequency penalty + presencePenalty + Set the repetition alpha presence penalty + miroStat + Set MiroStat sampling strategies. + mirostatTau + Set the MiroStat target entropy, parameter tau + mirostatEta + Set the MiroStat learning rate, parameter eta + penalizeNl + Whether to penalize newline tokens + nKeep + Set the number of tokens to keep from the initial prompt + seed + Set the RNG seed + nProbs + Set the amount top tokens probabilities to output if greater than 0. + minKeep + Set the amount of tokens the samplers should return at least (0 = disabled) + grammar + Set BNF-like grammar to constrain generations + penaltyPrompt + Override which part of the prompt is penalized for repetition. + ignoreEos + Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf) + disableTokenIds + Set the token ids to disable in the completion + stopStrings + Set strings upon seeing which token generation is stopped + samplers + Set which samplers to use for token generation in the given order + useChatTemplate + Set whether or not generate should apply a chat template + + + Notes + ----- + To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + the number of GPU layers with the `setNGpuLayers` method. + + When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + according to your hardware to avoid out-of-memory errors. + + References + ---------- + - `Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension + `__ + - https://github.com/pytorch/fairseq + + **Paper Abstract:** + *We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. + BART is trained by (1) corrupting text with an arbitrary noising function, and (2) + learning a model to reconstruct the original text. It uses a standard Tranformer-based + neural machine translation architecture which, despite its simplicity, can be seen as + generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), + and many other more recent pretraining schemes. We evaluate a number of noising approaches, + finding the best performance by both randomly shuffling the order of the original sentences + and using a novel in-filling scheme, where spans of text are replaced with a single mask token. + BART is particularly effective when fine tuned for text generation but also works well for + comprehension tasks. It matches the performance of RoBERTa with comparable training resources + on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, + question answering, and summarization tasks, with gains of up to 6 ROUGE. BART also provides + a 1.1 BLEU increase over a back-translation system for machine translation, with only target + language pretraining. 
We also report ablation experiments that replicate other pretraining + schemes within the BART framework, to better measure which factors most influence end-task performance.* + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> document = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> autoGGUFModel = AutoGGUFModel.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("completions") \\ + ... .setBatchSize(4) \\ + ... .setNPredict(20) \\ + ... .setNGpuLayers(99) \\ + ... .setTemperature(0.4) \\ + ... .setTopK(40) \\ + ... .setTopP(0.9) \\ + ... .setPenalizeNl(True) + >>> pipeline = Pipeline().setStages([document, autoGGUFModel]) + >>> data = spark.createDataFrame([["Hello, I am a"]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.select("completions").show(truncate = False) + +-----------------------------------------------------------------------------------------------------------------------------------+ + |completions | + +-----------------------------------------------------------------------------------------------------------------------------------+ + |[{document, 0, 78, new user. I am currently working on a project and I need to create a list of , {prompt -> Hello, I am a}, []}]| + +-----------------------------------------------------------------------------------------------------------------------------------+ + """ + + name = "AutoGGUFModel" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + outputAnnotatorType = AnnotatorType.DOCUMENT + + # -------- MODEl PARAMETERS -------- + nThreads = Param(Params._dummy(), "nThreads", "Set the number of threads to use during generation", + typeConverter=TypeConverters.toInt) + nThreadsDraft = Param(Params._dummy(), "nThreadsDraft", "Set the number of threads to use during draft generation", + typeConverter=TypeConverters.toInt) + nThreadsBatch = Param(Params._dummy(), "nThreadsBatch", + "Set the number of threads to use during batch and prompt processing", + typeConverter=TypeConverters.toInt) + nThreadsBatchDraft = Param(Params._dummy(), "nThreadsBatchDraft", + "Set the number of threads to use during batch and prompt processing", + typeConverter=TypeConverters.toInt) + nCtx = Param(Params._dummy(), "nCtx", "Set the size of the prompt context", typeConverter=TypeConverters.toInt) + nBatch = Param(Params._dummy(), "nBatch", + "Set the logical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt) + nUbatch = Param(Params._dummy(), "nUbatch", + "Set the physical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt) + nDraft = Param(Params._dummy(), "nDraft", "Set the number of tokens to draft for speculative decoding", + typeConverter=TypeConverters.toInt) + nChunks = Param(Params._dummy(), "nChunks", "Set the maximal number of chunks to process", + typeConverter=TypeConverters.toInt) + nSequences = Param(Params._dummy(), "nSequences", "Set the number of sequences to decode", + typeConverter=TypeConverters.toInt) + pSplit = Param(Params._dummy(), "pSplit", "Set the speculative decoding split probability", + typeConverter=TypeConverters.toFloat) + nGpuLayers = Param(Params._dummy(), "nGpuLayers", "Set the number of layers to store in VRAM (-1 - use default)", + typeConverter=TypeConverters.toInt) + nGpuLayersDraft = Param(Params._dummy(), 
"nGpuLayersDraft", + "Set the number of layers to store in VRAM for the draft model (-1 - use default)", + typeConverter=TypeConverters.toInt) + # Set how to split the model across GPUs + # + # - NONE: No GPU split + # - LAYER: Split the model across GPUs by layer + # - ROW: Split the model across GPUs by rows + gpuSplitMode = Param(Params._dummy(), "gpuSplitMode", "Set how to split the model across GPUs", + typeConverter=TypeConverters.toString) + mainGpu = Param(Params._dummy(), "mainGpu", "Set the main GPU that is used for scratch and small tensors.", + typeConverter=TypeConverters.toInt) + tensorSplit = Param(Params._dummy(), "tensorSplit", "Set how split tensors should be distributed across GPUs", + typeConverter=TypeConverters.toListFloat) + grpAttnN = Param(Params._dummy(), "grpAttnN", "Set the group-attention factor", typeConverter=TypeConverters.toInt) + grpAttnW = Param(Params._dummy(), "grpAttnW", "Set the group-attention width", typeConverter=TypeConverters.toInt) + ropeFreqBase = Param(Params._dummy(), "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling", + typeConverter=TypeConverters.toFloat) + ropeFreqScale = Param(Params._dummy(), "ropeFreqScale", + "Set the RoPE frequency scaling factor, expands context by a factor of 1/N", + typeConverter=TypeConverters.toFloat) + yarnExtFactor = Param(Params._dummy(), "yarnExtFactor", "Set the YaRN extrapolation mix factor", + typeConverter=TypeConverters.toFloat) + yarnAttnFactor = Param(Params._dummy(), "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude", + typeConverter=TypeConverters.toFloat) + yarnBetaFast = Param(Params._dummy(), "yarnBetaFast", "Set the YaRN low correction dim or beta", + typeConverter=TypeConverters.toFloat) + yarnBetaSlow = Param(Params._dummy(), "yarnBetaSlow", "Set the YaRN high correction dim or alpha", + typeConverter=TypeConverters.toFloat) + yarnOrigCtx = Param(Params._dummy(), "yarnOrigCtx", "Set the YaRN original context size of model", + typeConverter=TypeConverters.toInt) + defragmentationThreshold = Param(Params._dummy(), "defragmentationThreshold", + "Set the KV cache defragmentation threshold", typeConverter=TypeConverters.toFloat) + # Set optimization strategies that help on some NUMA systems (if available) + # + # Available Strategies: + # + # - DISABLED: No NUMA optimizations + # - DISTRIBUTE: Spread execution evenly over all + # - ISOLATE: Only spawn threads on CPUs on the node that execution started on + # - NUMA_CTL: Use the CPU map provided by numactl + # - MIRROR: Mirrors the model across NUMA nodes + numaStrategy = Param(Params._dummy(), "numaStrategy", + "Set optimization strategies that help on some NUMA systems (if available)", + typeConverter=TypeConverters.toString) + # Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
+ # + # - UNSPECIFIED: Don't use any scaling + # - LINEAR: Linear scaling + # - YARN: YaRN RoPE scaling + ropeScalingType = Param(Params._dummy(), "ropeScalingType", + "Set the RoPE frequency scaling method, defaults to linear unless specified by the model", + typeConverter=TypeConverters.toString) + # Set the pooling type for embeddings, use model default if unspecified + # + # - 0 UNSPECIFIED: Don't use any pooling + # - 1 MEAN: Mean Pooling + # - 2 CLS: CLS Pooling + poolingType = Param(Params._dummy(), "poolingType", + "Set the pooling type for embeddings, use model default if unspecified", + typeConverter=TypeConverters.toString) + modelDraft = Param(Params._dummy(), "modelDraft", "Set the draft model for speculative decoding", + typeConverter=TypeConverters.toString) + modelAlias = Param(Params._dummy(), "modelAlias", "Set a model alias", typeConverter=TypeConverters.toString) + lookupCacheStaticFilePath = Param(Params._dummy(), "lookupCacheStaticFilePath", + "Set path to static lookup cache to use for lookup decoding (not updated by generation)", + typeConverter=TypeConverters.toString) + lookupCacheDynamicFilePath = Param(Params._dummy(), "lookupCacheDynamicFilePath", + "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)", + typeConverter=TypeConverters.toString) + # loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") + embedding = Param(Params._dummy(), "embedding", "Whether to load model with embedding support", + typeConverter=TypeConverters.toBoolean) + flashAttention = Param(Params._dummy(), "flashAttention", "Whether to enable Flash Attention", + typeConverter=TypeConverters.toBoolean) + inputPrefixBos = Param(Params._dummy(), "inputPrefixBos", + "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string", + typeConverter=TypeConverters.toBoolean) + useMmap = Param(Params._dummy(), "useMmap", + "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)", + typeConverter=TypeConverters.toBoolean) + useMlock = Param(Params._dummy(), "useMlock", + "Whether to force the system to keep model in RAM rather than swapping or compressing", + typeConverter=TypeConverters.toBoolean) + noKvOffload = Param(Params._dummy(), "noKvOffload", "Whether to disable KV offload", + typeConverter=TypeConverters.toBoolean) + systemPrompt = Param(Params._dummy(), "systemPrompt", "Set a system prompt to use", + typeConverter=TypeConverters.toString) + chatTemplate = Param(Params._dummy(), "chatTemplate", "The chat template to use", + typeConverter=TypeConverters.toString) + + # -------- INFERENCE PARAMETERS -------- + inputPrefix = Param(Params._dummy(), "inputPrefix", "Set the prompt to start generation with", + typeConverter=TypeConverters.toString) + inputSuffix = Param(Params._dummy(), "inputSuffix", "Set a suffix for infilling", + typeConverter=TypeConverters.toString) + cachePrompt = Param(Params._dummy(), "cachePrompt", "Whether to remember the prompt to avoid reprocessing it", + typeConverter=TypeConverters.toBoolean) + nPredict = Param(Params._dummy(), "nPredict", "Set the number of tokens to predict", + typeConverter=TypeConverters.toInt) + topK = Param(Params._dummy(), "topK", "Set top-k sampling", typeConverter=TypeConverters.toInt) + topP = Param(Params._dummy(), "topP", "Set top-p sampling", typeConverter=TypeConverters.toFloat) + minP = Param(Params._dummy(), "minP", "Set min-p sampling", typeConverter=TypeConverters.toFloat) + tfsZ = Param(Params._dummy(), "tfsZ", "Set tail free 
sampling, parameter z", typeConverter=TypeConverters.toFloat) + typicalP = Param(Params._dummy(), "typicalP", "Set locally typical sampling, parameter p", + typeConverter=TypeConverters.toFloat) + temperature = Param(Params._dummy(), "temperature", "Set the temperature", typeConverter=TypeConverters.toFloat) + dynamicTemperatureRange = Param(Params._dummy(), "dynatempRange", "Set the dynamic temperature range", + typeConverter=TypeConverters.toFloat) + dynamicTemperatureExponent = Param(Params._dummy(), "dynatempExponent", "Set the dynamic temperature exponent", + typeConverter=TypeConverters.toFloat) + repeatLastN = Param(Params._dummy(), "repeatLastN", "Set the last n tokens to consider for penalties", + typeConverter=TypeConverters.toInt) + repeatPenalty = Param(Params._dummy(), "repeatPenalty", "Set the penalty of repeated sequences of tokens", + typeConverter=TypeConverters.toFloat) + frequencyPenalty = Param(Params._dummy(), "frequencyPenalty", "Set the repetition alpha frequency penalty", + typeConverter=TypeConverters.toFloat) + presencePenalty = Param(Params._dummy(), "presencePenalty", "Set the repetition alpha presence penalty", + typeConverter=TypeConverters.toFloat) + miroStat = Param(Params._dummy(), "miroStat", "Set MiroStat sampling strategies.", + typeConverter=TypeConverters.toString) + miroStatTau = Param(Params._dummy(), "mirostatTau", "Set the MiroStat target entropy, parameter tau", + typeConverter=TypeConverters.toFloat) + miroStatEta = Param(Params._dummy(), "mirostatEta", "Set the MiroStat learning rate, parameter eta", + typeConverter=TypeConverters.toFloat) + penalizeNl = Param(Params._dummy(), "penalizeNl", "Whether to penalize newline tokens", + typeConverter=TypeConverters.toBoolean) + nKeep = Param(Params._dummy(), "nKeep", "Set the number of tokens to keep from the initial prompt", + typeConverter=TypeConverters.toInt) + seed = Param(Params._dummy(), "seed", "Set the RNG seed", typeConverter=TypeConverters.toInt) + nProbs = Param(Params._dummy(), "nProbs", "Set the amount top tokens probabilities to output if greater than 0.", + typeConverter=TypeConverters.toInt) + minKeep = Param(Params._dummy(), "minKeep", + "Set the amount of tokens the samplers should return at least (0 = disabled)", + typeConverter=TypeConverters.toInt) + grammar = Param(Params._dummy(), "grammar", "Set BNF-like grammar to constrain generations", + typeConverter=TypeConverters.toString) + penaltyPrompt = Param(Params._dummy(), "penaltyPrompt", + "Override which part of the prompt is penalized for repetition.", + typeConverter=TypeConverters.toString) + ignoreEos = Param(Params._dummy(), "ignoreEos", + "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)", + typeConverter=TypeConverters.toBoolean) + disableTokenIds = Param(Params._dummy(), "disableTokenIds", "Set the token ids to disable in the completion", + typeConverter=TypeConverters.toListInt) + stopStrings = Param(Params._dummy(), "stopStrings", "Set strings upon seeing which token generation is stopped", + typeConverter=TypeConverters.toListString) + samplers = Param(Params._dummy(), "samplers", "Set which samplers to use for token generation in the given order", + typeConverter=TypeConverters.toListString) + useChatTemplate = Param(Params._dummy(), "useChatTemplate", + "Set whether or not generate should apply a chat template", + typeConverter=TypeConverters.toBoolean) + + # -------- MODEL SETTERS -------- + def setNThreads(self, nThreads: int): + """Set the number of threads to use 
during generation""" + return self._set(nThreads=nThreads) + + def setNThreadsDraft(self, nThreadsDraft: int): + """Set the number of threads to use during draft generation""" + return self._set(nThreadsDraft=nThreadsDraft) + + def setNThreadsBatch(self, nThreadsBatch: int): + """Set the number of threads to use during batch and prompt processing""" + return self._set(nThreadsBatch=nThreadsBatch) + + def setNThreadsBatchDraft(self, nThreadsBatchDraft: int): + """Set the number of threads to use during batch and prompt processing""" + return self._set(nThreadsBatchDraft=nThreadsBatchDraft) + + def setNCtx(self, nCtx: int): + """Set the size of the prompt context""" + return self._set(nCtx=nCtx) + + def setNBatch(self, nBatch: int): + """Set the logical batch size for prompt processing (must be >=32 to use BLAS)""" + return self._set(nBatch=nBatch) + + def setNUbatch(self, nUbatch: int): + """Set the physical batch size for prompt processing (must be >=32 to use BLAS)""" + return self._set(nUbatch=nUbatch) + + def setNDraft(self, nDraft: int): + """Set the number of tokens to draft for speculative decoding""" + return self._set(nDraft=nDraft) + + def setNChunks(self, nChunks: int): + """Set the maximal number of chunks to process""" + return self._set(nChunks=nChunks) + + def setNSequences(self, nSequences: int): + """Set the number of sequences to decode""" + return self._set(nSequences=nSequences) + + def setPSplit(self, pSplit: float): + """Set the speculative decoding split probability""" + return self._set(pSplit=pSplit) + + def setNGpuLayers(self, nGpuLayers: int): + """Set the number of layers to store in VRAM (-1 - use default)""" + return self._set(nGpuLayers=nGpuLayers) + + def setNGpuLayersDraft(self, nGpuLayersDraft: int): + """Set the number of layers to store in VRAM for the draft model (-1 - use default)""" + return self._set(nGpuLayersDraft=nGpuLayersDraft) + + def setGpuSplitMode(self, gpuSplitMode: str): + """Set how to split the model across GPUs""" + return self._set(gpuSplitMode=gpuSplitMode) + + def setMainGpu(self, mainGpu: int): + """Set the main GPU that is used for scratch and small tensors.""" + return self._set(mainGpu=mainGpu) + + def setTensorSplit(self, tensorSplit: List[float]): + """Set how split tensors should be distributed across GPUs""" + return self._set(tensorSplit=tensorSplit) + + def setGrpAttnN(self, grpAttnN: int): + """Set the group-attention factor""" + return self._set(grpAttnN=grpAttnN) + + def setGrpAttnW(self, grpAttnW: int): + """Set the group-attention width""" + return self._set(grpAttnW=grpAttnW) + + def setRopeFreqBase(self, ropeFreqBase: float): + """Set the RoPE base frequency, used by NTK-aware scaling""" + return self._set(ropeFreqBase=ropeFreqBase) + + def setRopeFreqScale(self, ropeFreqScale: float): + """Set the RoPE frequency scaling factor, expands context by a factor of 1/N""" + return self._set(ropeFreqScale=ropeFreqScale) + + def setYarnExtFactor(self, yarnExtFactor: float): + """Set the YaRN extrapolation mix factor""" + return self._set(yarnExtFactor=yarnExtFactor) + + def setYarnAttnFactor(self, yarnAttnFactor: float): + """Set the YaRN scale sqrt(t) or attention magnitude""" + return self._set(yarnAttnFactor=yarnAttnFactor) + + def setYarnBetaFast(self, yarnBetaFast: float): + """Set the YaRN low correction dim or beta""" + return self._set(yarnBetaFast=yarnBetaFast) + + def setYarnBetaSlow(self, yarnBetaSlow: float): + """Set the YaRN high correction dim or alpha""" + return self._set(yarnBetaSlow=yarnBetaSlow) + + def 
setYarnOrigCtx(self, yarnOrigCtx: int):
+        """Set the YaRN original context size of model"""
+        return self._set(yarnOrigCtx=yarnOrigCtx)
+
+    def setDefragmentationThreshold(self, defragmentationThreshold: float):
+        """Set the KV cache defragmentation threshold"""
+        return self._set(defragmentationThreshold=defragmentationThreshold)
+
+    def setNumaStrategy(self, numaStrategy: str):
+        """Set optimization strategies that help on some NUMA systems (if available)"""
+        return self._set(numaStrategy=numaStrategy)
+
+    def setRopeScalingType(self, ropeScalingType: str):
+        """Set the RoPE frequency scaling method, defaults to linear unless specified by the model"""
+        return self._set(ropeScalingType=ropeScalingType)
+
+    def setPoolingType(self, poolingType: str):
+        """Set the pooling type for embeddings, use model default if unspecified"""
+        return self._set(poolingType=poolingType)
+
+    def setModelDraft(self, modelDraft: str):
+        """Set the draft model for speculative decoding"""
+        return self._set(modelDraft=modelDraft)
+
+    def setModelAlias(self, modelAlias: str):
+        """Set a model alias"""
+        return self._set(modelAlias=modelAlias)
+
+    def setLookupCacheStaticFilePath(self, lookupCacheStaticFilePath: str):
+        """Set path to static lookup cache to use for lookup decoding (not updated by generation)"""
+        return self._set(lookupCacheStaticFilePath=lookupCacheStaticFilePath)
+
+    def setLookupCacheDynamicFilePath(self, lookupCacheDynamicFilePath: str):
+        """Set path to dynamic lookup cache to use for lookup decoding (updated by generation)"""
+        return self._set(lookupCacheDynamicFilePath=lookupCacheDynamicFilePath)
+
+    def setEmbedding(self, embedding: bool):
+        """Whether to load model with embedding support"""
+        return self._set(embedding=embedding)
+
+    def setFlashAttention(self, flashAttention: bool):
+        """Whether to enable Flash Attention"""
+        return self._set(flashAttention=flashAttention)
+
+    def setInputPrefixBos(self, inputPrefixBos: bool):
+        """Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string"""
+        return self._set(inputPrefixBos=inputPrefixBos)
+
+    def setUseMmap(self, useMmap: bool):
+        """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)"""
+        return self._set(useMmap=useMmap)
+
+    def setUseMlock(self, useMlock: bool):
+        """Whether to force the system to keep model in RAM rather than swapping or compressing"""
+        return self._set(useMlock=useMlock)
+
+    def setNoKvOffload(self, noKvOffload: bool):
+        """Whether to disable KV offload"""
+        return self._set(noKvOffload=noKvOffload)
+
+    def setSystemPrompt(self, systemPrompt: str):
+        """Set a system prompt to use"""
+        return self._set(systemPrompt=systemPrompt)
+
+    def setChatTemplate(self, chatTemplate: str):
+        """The chat template to use"""
+        return self._set(chatTemplate=chatTemplate)
+
+    # -------- INFERENCE SETTERS --------
+    def setInputPrefix(self, inputPrefix: str):
+        """Set the prompt to start generation with"""
+        return self._set(inputPrefix=inputPrefix)
+
+    def setInputSuffix(self, inputSuffix: str):
+        """Set a suffix for infilling"""
+        return self._set(inputSuffix=inputSuffix)
+
+    def setCachePrompt(self, cachePrompt: bool):
+        """Whether to remember the prompt to avoid reprocessing it"""
+        return self._set(cachePrompt=cachePrompt)
+
+    def setNPredict(self, nPredict: int):
+        """Set the number of tokens to predict"""
+        return self._set(nPredict=nPredict)
+
+    def setTopK(self, topK: int):
+        """Set top-k sampling"""
+        return self._set(topK=topK)
+
+    def setTopP(self, topP: float):
+        """Set top-p sampling"""
+        return self._set(topP=topP)
+
+    def setMinP(self, minP: float):
+        """Set min-p sampling"""
+        return self._set(minP=minP)
+
+    def setTfsZ(self, tfsZ: float):
+        """Set tail free sampling, parameter z"""
+        return self._set(tfsZ=tfsZ)
+
+    def setTypicalP(self, typicalP: float):
+        """Set locally typical sampling, parameter p"""
+        return self._set(typicalP=typicalP)
+
+    def setTemperature(self, temperature: float):
+        """Set the temperature"""
+        return self._set(temperature=temperature)
+
+    def setDynamicTemperatureRange(self, dynamicTemperatureRange: float):
+        """Set the dynamic temperature range"""
+        return self._set(dynamicTemperatureRange=dynamicTemperatureRange)
+
+    def setDynamicTemperatureExponent(self, dynamicTemperatureExponent: float):
+        """Set the dynamic temperature exponent"""
+        return self._set(dynamicTemperatureExponent=dynamicTemperatureExponent)
+
+    def setRepeatLastN(self, repeatLastN: int):
+        """Set the last n tokens to consider for penalties"""
+        return self._set(repeatLastN=repeatLastN)
+
+    def setRepeatPenalty(self, repeatPenalty: float):
+        """Set the penalty of repeated sequences of tokens"""
+        return self._set(repeatPenalty=repeatPenalty)
+
+    def setFrequencyPenalty(self, frequencyPenalty: float):
+        """Set the repetition alpha frequency penalty"""
+        return self._set(frequencyPenalty=frequencyPenalty)
+
+    def setPresencePenalty(self, presencePenalty: float):
+        """Set the repetition alpha presence penalty"""
+        return self._set(presencePenalty=presencePenalty)
+
+    def setMiroStat(self, miroStat: str):
+        """Set MiroStat sampling strategies."""
+        return self._set(miroStat=miroStat)
+
+    def setMiroStatTau(self, miroStatTau: float):
+        """Set the MiroStat target entropy, parameter tau"""
+        return self._set(miroStatTau=miroStatTau)
+
+    def setMiroStatEta(self, miroStatEta: float):
+        """Set the MiroStat learning rate, parameter eta"""
+        return self._set(miroStatEta=miroStatEta)
+
+    def setPenalizeNl(self, penalizeNl: bool):
+        """Whether to penalize newline tokens"""
+        return self._set(penalizeNl=penalizeNl)
+
+    def setNKeep(self, nKeep: int):
+        """Set the number of tokens to keep from the initial prompt"""
+        return self._set(nKeep=nKeep)
+
+    def setSeed(self, seed: int):
+        """Set the RNG seed"""
+        return self._set(seed=seed)
+
+    def setNProbs(self, nProbs: int):
+        """Set the amount of top token probabilities to output if greater than 0."""
+        return self._set(nProbs=nProbs)
+
+    def setMinKeep(self, minKeep: int):
+        """Set the amount of tokens the samplers should return at least (0 = disabled)"""
+        return self._set(minKeep=minKeep)
+
+    def setGrammar(self, grammar: str):
+        """Set BNF-like grammar to constrain generations"""
+        return self._set(grammar=grammar)
+
+    def setPenaltyPrompt(self, penaltyPrompt: str):
+        """Override which part of the prompt is penalized for repetition."""
+        return self._set(penaltyPrompt=penaltyPrompt)
+
+    def setIgnoreEos(self, ignoreEos: bool):
+        """Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)"""
+        return self._set(ignoreEos=ignoreEos)
+
+    def setDisableTokenIds(self, disableTokenIds: List[int]):
+        """Set the token ids to disable in the completion"""
+        return self._set(disableTokenIds=disableTokenIds)
+
+    def setStopStrings(self, stopStrings: List[str]):
+        """Set strings upon seeing which token generation is stopped"""
+        return self._set(stopStrings=stopStrings)
+
+    def setSamplers(self, samplers: List[str]):
+        """Set which samplers to use for token generation in the given
order""" + return self._set(samplers=samplers) + + def setUseChatTemplate(self, useChatTemplate: bool): + """Set whether generate should apply a chat template""" + return self._set(useChatTemplate=useChatTemplate) + + # -------- JAVA SETTERS -------- + def setTokenIdBias(self, tokenIdBias: Dict[int, float]): + """Set token id bias""" + return self._call_java("setTokenIdBias", tokenIdBias) + + def setTokenBias(self, tokenBias: Dict[str, float]): + """Set token id bias""" + return self._call_java("setTokenBias", tokenBias) + + def setLoraAdapters(self, loraAdapters: Dict[str, float]): + """Set token id bias""" + return self._call_java("setLoraAdapters", loraAdapters) + + def getMetadata(self): + """Gets the metadata of the model""" + return self._call_java("getMetadata") + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel", java_model=None): + super(AutoGGUFModel, self).__init__( + classname=classname, + java_model=java_model + ) + # self._setDefault() + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + AutoGGUFModel + The restored model + """ + from sparknlp.internal import _AutoGGUFLoader + jModel = _AutoGGUFLoader(folder, spark_session._jsparkSession)._java_obj + return AutoGGUFModel(java_model=jModel) + + @staticmethod + def pretrained(name="gguf-phi3-mini-4k-instruct-q4", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default "gguf-phi3-mini-4k-instruct-q4" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. + + Returns + ------- + AutoGGUFModel + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(AutoGGUFModel, name, lang, remote_loc) diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index 2c3ece653f7f3d..adf19667279fbe 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -974,6 +974,12 @@ def __init__(self, path, jspark): ) +class _AutoGGUFLoader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_AutoGGUFLoader, self).__init__( + "com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel.loadSavedModel", path, jspark) + + class _MxbaiEmbeddingsLoader(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_MxbaiEmbeddingsLoader, self).__init__( @@ -986,4 +992,3 @@ def __init__(self, path, jspark): super(_SnowFlakeEmbeddingsLoader, self).__init__( "com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings.loadSavedModel", path, jspark ) - diff --git a/python/test/annotator/seq2seq/auto_gguf_model_test.py b/python/test/annotator/seq2seq/auto_gguf_model_test.py new file mode 100644 index 00000000000000..0c2d4a349ca806 --- /dev/null +++ b/python/test/annotator/seq2seq/auto_gguf_model_test.py @@ -0,0 +1,194 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.slow +class AutoGGUFModelTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + + def runTest(self): + data = ( + self.spark.createDataFrame( + [ + ["The moons of Jupiter are "], + ["Earth is "], + ["The moon is "], + ["The sun is "], + ] + ) + .toDF("text") + .repartition(1) + ) + + document_assembler = ( + DocumentAssembler().setInputCol("text").setOutputCol("document") + ) + + modelPath = "models/codellama-7b.Q2_K.gguf" + model = ( + AutoGGUFModel.loadSavedModel(modelPath, self.spark) + .setInputCols("document") + .setOutputCol("completions") + .setBatchSize(4) + .setNPredict(20) + .setNGpuLayers(99) + .setTemperature(0.4) + .setTopK(40) + .setTopP(0.9) + .setPenalizeNl(True) + ) + + pipeline = Pipeline().setStages([document_assembler, model]) + results = pipeline.fit(data).transform(data) + + results.select("completions").show(truncate=False) + + +@pytest.mark.slow +class AutoGGUFModelParametersTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + + def runTest(self): + data = ( + self.spark.createDataFrame([["The moons of Jupiter are "]]) + .toDF("text") + .repartition(1) + ) + + document_assembler = ( + DocumentAssembler().setInputCol("text").setOutputCol("document") + ) + + modelPath = "models/codellama-7b.Q2_K.gguf" + model = ( + AutoGGUFModel.loadSavedModel(modelPath, self.spark) + .setInputCols("document") + .setOutputCol("completions") + .setBatchSize(4) + ) + + # Model Parameters + model.setNThreads(8) + model.setNThreadsDraft(8) + model.setNThreadsBatch(8) + model.setNThreadsBatchDraft(8) + model.setNCtx(512) + model.setNBatch(32) + model.setNUbatch(32) + model.setNDraft(5) + model.setNChunks(-1) + model.setNSequences(1) + model.setPSplit(0.1) + model.setNGpuLayers(99) + model.setNGpuLayersDraft(99) + model.setGpuSplitMode("NONE") + model.setMainGpu(0) + model.setTensorSplit([]) + model.setNBeams(0) + model.setGrpAttnN(1) + model.setGrpAttnW(512) + model.setRopeFreqBase(1.0) + model.setRopeFreqScale(1.0) + model.setYarnExtFactor(1.0) + model.setYarnAttnFactor(1.0) + model.setYarnBetaFast(32.0) + model.setYarnBetaSlow(1.0) + model.setYarnOrigCtx(0) + model.setDefragmentationThreshold(-1.0) + model.setNumaStrategy("DISTRIBUTE") + model.setRopeScalingType("UNSPECIFIED") + model.setPoolingType("UNSPECIFIED") + model.setModelDraft("") + model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") + model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") + model.setLoraBase("") + model.setEmbedding(False) + model.setFlashAttention(False) + model.setInputPrefixBos(False) + model.setUseMmap(False) + model.setUseMlock(False) + model.setNoKvOffload(False) + model.setSystemPrompt("") + model.setChatTemplate("") + + # Inference Parameters + model.setInputPrefix("") + model.setInputSuffix("") + model.setCachePrompt(False) + model.setNPredict(-1) + model.setTopK(40) + model.setTopP(0.9) + model.setMinP(0.1) + model.setTfsZ(1.0) + 
model.setTypicalP(1.0) + model.setTemperature(0.8) + model.setDynamicTemperatureRange(0.0) + model.setDynamicTemperatureExponent(1.0) + model.setRepeatLastN(64) + model.setRepeatPenalty(1.0) + model.setFrequencyPenalty(0.0) + model.setPresencePenalty(0.0) + model.setMiroStat("DISABLED") + model.setMiroStatTau(5.0) + model.setMiroStatEta(0.1) + model.setPenalizeNl(False) + model.setNKeep(0) + model.setSeed(-1) + model.setNProbs(0) + model.setMinKeep(0) + model.setGrammar("") + model.setPenaltyPrompt("") + model.setIgnoreEos(False) + model.setDisableTokenIds([]) + model.setStopStrings([]) + model.setUseChatTemplate(False) + model.setNPredict(2) + model.setSamplers(["TOP_P", "TOP_K"]) + + # Special PySpark Parameters (Scala StructFeatures) + model.setTokenIdBias({0: 0.0, 1: 0.0}) + model.setTokenBias({"!": 0.0, "?": 0.0}) + model.setLoraAdapters({" ": 0.0}) + + pipeline = Pipeline().setStages([document_assembler, model]) + results = pipeline.fit(data).transform(data) + + results.select("completions").show(truncate=False) + + +@pytest.mark.slow +class AutoGGUFModelMetadataTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + + def runTest(self): + modelPath = "models/codellama-7b.Q2_K.gguf" + model = ( + AutoGGUFModel.loadSavedModel(modelPath, self.spark) + .setInputCols("document") + .setOutputCol("completions") + ) + + metadata = model.getMetadata() + assert len(metadata) > 0 + print(eval(metadata)) diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties index 703f281a1da1d1..6a17297f6fda41 100644 --- a/src/main/resources/log4j.properties +++ b/src/main/resources/log4j.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=WARN, STDOUT +log4j.rootLogger=DEBUG, STDOUT log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n diff --git a/src/main/resources/log4j2.properties b/src/main/resources/log4j2.properties index 703f281a1da1d1..5824b6f6cf7dab 100644 --- a/src/main/resources/log4j2.properties +++ b/src/main/resources/log4j2.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=WARN, STDOUT +log4j.rootLogger=debug, STDOUT log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n diff --git a/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala new file mode 100644 index 00000000000000..495e8cb2a6b0f9 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/gguf/GGUFWrapper.scala @@ -0,0 +1,92 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.johnsnowlabs.ml.gguf + +import com.johnsnowlabs.nlp.llama.{LlamaModel, ModelParameters} +import org.apache.spark.SparkFiles +import org.apache.spark.sql.SparkSession +import org.slf4j.{Logger, LoggerFactory} + +import java.io.File +import java.nio.file.{Files, Paths} + +class GGUFWrapper(var modelFileName: String, var modelFolder: String) extends Serializable { + + /** For Deserialization */ + def this() = { + this(null, null) + } + + // Important for serialization on none-kryo serializers + @transient private var llamaModel: LlamaModel = _ + + def getSession(modelParameters: ModelParameters): LlamaModel = + this.synchronized { + if (llamaModel == null) { + // TODO: Validate when modelFileName or tmpFolder is None?? + val modelFilePath = SparkFiles.get(modelFileName) + + if (Paths.get(modelFilePath).toFile.exists()) { + modelParameters.setModelFilePath(modelFilePath) + llamaModel = GGUFWrapper.withSafeGGUFModelLoader(modelParameters) + } else + throw new IllegalStateException( + s"Model file $modelFileName does not exist in SparkFiles.") + } + // TODO: if the model is already loaded then the model parameters will not apply. perhaps output a logline here. + llamaModel + } + + def saveToFile(file: String): Unit = { + val modelFilePath = SparkFiles.get(modelFileName) + val modelOutputPath = Paths.get(file, modelFileName) + Files.copy(Paths.get(modelFilePath), modelOutputPath) + } + + // Destructor to free the model when this object is garbage collected + override def finalize(): Unit = { + if (llamaModel != null) { + llamaModel.close() + } + } + +} + +/** Companion object */ +object GGUFWrapper { + private[GGUFWrapper] val logger: Logger = LoggerFactory.getLogger("GGUFWrapper") + + // TODO: make sure this.synchronized is needed or it's not a bottleneck + private def withSafeGGUFModelLoader(modelParameters: ModelParameters): LlamaModel = + this.synchronized { + new LlamaModel(modelParameters) // TODO: Model parameters + } + + def read(sparkSession: SparkSession, modelPath: String): GGUFWrapper = { + // TODO Better Sanity Check + val modelFile = new File(modelPath) + val modelFileExist: Boolean = modelFile.exists() + + if (!modelFile.getName.endsWith(".gguf")) + throw new IllegalArgumentException(s"Model file $modelPath is not a GGUF model file") + + if (modelFileExist) { + sparkSession.sparkContext.addFile(modelPath) + } else throw new IllegalArgumentException(s"Model file $modelPath does not exist") + + new GGUFWrapper(modelFile.getName, modelFile.getParent) + } +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala new file mode 100644 index 00000000000000..e6d832eef9a79f --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala @@ -0,0 +1,1292 @@ +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel +import com.johnsnowlabs.nlp.llama.args._ +import com.johnsnowlabs.nlp.llama.{InferenceParameters, ModelParameters} +import com.johnsnowlabs.nlp.serialization.StructFeature +import org.apache.spark.ml.param._ +import org.slf4j.LoggerFactory + +import scala.collection.mutable +import scala.jdk.CollectionConverters._ + +/** Contains settable parameters for the [[AutoGGUFModel]]. 
+ * + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupprio setParam 1 + * @groupprio getParam 2 + * @groupprio param 3 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +trait HasLlamaCppProperties { + this: ParamsAndFeaturesWritable with HasProtectedParams => + val logger = LoggerFactory.getLogger(this.getClass) + // ---------------- MODEL PARAMETERS ---------------- + /** @group param */ + val nThreads = + new IntParam(this, "nThreads", "Set the number of threads to use during generation") + + /** @group param */ + val nThreadsDraft = new IntParam( + this, + "nThreadsDraft", + "Set the number of threads to use during draft generation") + + /** @group param */ + val nThreadsBatch = new IntParam( + this, + "nThreadsBatch", + "Set the number of threads to use during batch and prompt processing") + + /** @group param */ + val nThreadsBatchDraft = new IntParam( + this, + "nThreadsBatchDraft", + "Set the number of threads to use during batch and prompt processing") + + /** @group param */ + val nCtx = new IntParam(this, "nCtx", "Set the size of the prompt context") + + /** @group param */ + val nBatch = new IntParam( + this, + "nBatch", + "Set the logical batch size for prompt processing (must be >=32 to use BLAS)") + + /** @group param */ + val nUbatch = new IntParam( + this, + "nUbatch", + "Set the physical batch size for prompt processing (must be >=32 to use BLAS)") + + /** @group param */ + val nDraft = + new IntParam(this, "nDraft", "Set the number of tokens to draft for speculative decoding") + + /** @group param */ + val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") + + /** @group param */ + val nSequences = + new IntParam(this, "nSequences", "Set the number of sequences to decode") + + /** @group param */ + val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") + + /** @group param */ + val nGpuLayers = new IntParam( + this, + "nGpuLayers", + "Set the number of layers to store in VRAM (-1 - use default)") + + /** @group param */ + val nGpuLayersDraft = new IntParam( + this, + "nGpuLayersDraft", + "Set the number of layers to store in VRAM for the draft model (-1 - use default)") + + /** Set how to split the model across GPUs + * + * - NONE: No GPU split + * - LAYER: Split the model across GPUs by layer + * - ROW: Split the model across GPUs by rows + * + * @group param + */ + val gpuSplitMode = + new Param[String](this, "gpuSplitMode", "Set how to split the model across GPUs") + + /** @group param */ + val mainGpu = + new IntParam(this, "mainGpu", "Set the main GPU that is used for scratch and small tensors.") + + /** @group param */ + val tensorSplit = new DoubleArrayParam( + this, + "tensorSplit", + "Set how split tensors should be distributed across GPUs") + + /** @group param */ + val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") + + /** @group param */ + val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") + + /** @group param */ + val ropeFreqBase = + new FloatParam(this, "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling") + + /** @group param */ + val ropeFreqScale = new FloatParam( + this, + "ropeFreqScale", + "Set the RoPE frequency scaling factor, expands context by a factor of 1/N") + + /** @group param */ + val 
yarnExtFactor = + new FloatParam(this, "yarnExtFactor", "Set the YaRN extrapolation mix factor") + + /** @group param */ + val yarnAttnFactor = + new FloatParam(this, "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude") + + /** @group param */ + val yarnBetaFast = + new FloatParam(this, "yarnBetaFast", "Set the YaRN low correction dim or beta") + + /** @group param */ + val yarnBetaSlow = + new FloatParam(this, "yarnBetaSlow", "Set the YaRN high correction dim or alpha") + + /** @group param */ + val yarnOrigCtx = + new IntParam(this, "yarnOrigCtx", "Set the YaRN original context size of model") + + /** @group param */ + val defragmentationThreshold = + new FloatParam(this, "defragmentationThreshold", "Set the KV cache defragmentation threshold") + + /** Set optimization strategies that help on some NUMA systems (if available) + * + * Available Strategies: + * + * - DISABLED: No NUMA optimizations + * - DISTRIBUTE: Spread execution evenly over all + * - ISOLATE: Only spawn threads on CPUs on the node that execution started on + * - NUMA_CTL: Use the CPU map provided by numactl + * - MIRROR: Mirrors the model across NUMA nodes + * + * @group param + */ + val numaStrategy = new Param[String]( + this, + "numaStrategy", + "Set optimization strategies that help on some NUMA systems (if available)") + + /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. + * + * - UNSPECIFIED: Don't use any scaling + * - LINEAR: Linear scaling + * - YARN: YaRN RoPE scaling + * @group param + */ + val ropeScalingType = new Param[String]( + this, + "ropeScalingType", + "Set the RoPE frequency scaling method, defaults to linear unless specified by the model") + + /** Set the pooling type for embeddings, use model default if unspecified + * + * - 0 UNSPECIFIED: Don't use any pooling + * - 1 MEAN: Mean Pooling + * - 2 CLS: CLS Pooling + * + * @group param + */ + val poolingType = new Param[String]( + this, + "poolingType", + "Set the pooling type for embeddings, use model default if unspecified") + // model = new Param[String](this, "model", "Set the model file path to load") + /** @group param */ + val modelDraft = + new Param[String](this, "modelDraft", "Set the draft model for speculative decoding") + + // modelAlias = new Param[String](this, "modelAlias", "Set a model alias") + /** @group param */ + val lookupCacheStaticFilePath = new Param[String]( + this, + "lookupCacheStaticFilePath", + "Set path to static lookup cache to use for lookup decoding (not updated by generation)") + + /** @group param */ + val lookupCacheDynamicFilePath = new Param[String]( + this, + "lookupCacheDynamicFilePath", + "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") + + /** @group param */ + val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") + + val embedding = + new BooleanParam(this, "embedding", "Whether to load model with embedding support") + + /** @group param */ + val flashAttention = + new BooleanParam(this, "flashAttention", "Whether to enable Flash Attention") + + /** @group param */ + val inputPrefixBos = new BooleanParam( + this, + "inputPrefixBos", + "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string") + + /** @group param */ + val useMmap = new BooleanParam( + this, + "useMmap", + "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)") + + /** @group param */ + val useMlock = new BooleanParam( + this, + "useMlock", + "Whether to force 
the system to keep model in RAM rather than swapping or compressing") + + /** @group param */ + val noKvOffload = new BooleanParam(this, "noKvOffload", "Whether to disable KV offload") + + /** @group param */ + val systemPrompt = new Param[String](this, "systemPrompt", "Set a system prompt to use") + + /** @group param */ + val chatTemplate = + new Param[String](this, "chatTemplate", "The chat template to use") + + /** Set the number of threads to use during generation + * + * @group setParam + */ + def setNThreads(nThreads: Int): this.type = { set(this.nThreads, nThreads) } + + /** Set the number of threads to use during draft generation + * + * @group setParam + */ + def setNThreadsDraft(nThreadsDraft: Int): this.type = { set(this.nThreadsDraft, nThreadsDraft) } + + /** Set the number of threads to use during batch and prompt processing + * + * @group setParam + */ + def setNThreadsBatch(nThreadsBatch: Int): this.type = { set(this.nThreadsBatch, nThreadsBatch) } + + /** Set the number of threads to use during batch and prompt processing + * + * @group setParam + */ + def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { + set(this.nThreadsBatchDraft, nThreadsBatchDraft) + } + + /** Set the size of the prompt context + * + * @group setParam + */ + def setNCtx(nCtx: Int): this.type = { set(this.nCtx, nCtx) } + + /** Set the logical batch size for prompt processing (must be >=32 to use BLAS) + * + * @group setParam + */ + def setNBatch(nBatch: Int): this.type = { set(this.nBatch, nBatch) } + + /** Set the physical batch size for prompt processing (must be >=32 to use BLAS) + * + * @group setParam + */ + def setNUbatch(nUbatch: Int): this.type = { set(this.nUbatch, nUbatch) } + + /** Set the number of tokens to draft for speculative decoding + * + * @group setParam + */ + def setNDraft(nDraft: Int): this.type = { set(this.nDraft, nDraft) } + + /** Set the maximal number of chunks to process + * + * @group setParam + */ + def setNChunks(nChunks: Int): this.type = { set(this.nChunks, nChunks) } + + /** Set the number of sequences to decode + * + * @group setParam + */ + def setNSequences(nSequences: Int): this.type = { set(this.nSequences, nSequences) } + + /** Set the speculative decoding split probability + * + * @group setParam + */ + def setPSplit(pSplit: Float): this.type = { set(this.pSplit, pSplit) } + + /** Set the number of layers to store in VRAM (-1 - use default) + * + * @group setParam + */ + def setNGpuLayers(nGpuLayers: Int): this.type = { set(this.nGpuLayers, nGpuLayers) } + + /** Set the number of layers to store in VRAM for the draft model (-1 - use default) + * + * @group setParam + */ + def setNGpuLayersDraft(nGpuLayersDraft: Int): this.type = { + set(this.nGpuLayersDraft, nGpuLayersDraft) + } + + /** Set how to split the model across GPUs + * + * - NONE: No GPU split + * -LAYER: Split the model across GPUs by layer 2. 
ROW: Split the model across GPUs by rows + * + * @group setParam + */ + def setGpuSplitMode(splitMode: String): this.type = { set(this.gpuSplitMode, splitMode) } + + /** Set the GPU that is used for scratch and small tensors + * + * @group setParam + */ + def setMainGpu(mainGpu: Int): this.type = { set(this.mainGpu, mainGpu) } + + /** Set how split tensors should be distributed across GPUs + * + * @group setParam + */ + def setTensorSplit(tensorSplit: Array[Double]): this.type = { + set(this.tensorSplit, tensorSplit) + } + + /** Set the group-attention factor + * + * @group setParam + */ + def setGrpAttnN(grpAttnN: Int): this.type = { set(this.grpAttnN, grpAttnN) } + + /** Set the group-attention width + * + * @group setParam + */ + def setGrpAttnW(grpAttnW: Int): this.type = { set(this.grpAttnW, grpAttnW) } + + /** Set the RoPE base frequency, used by NTK-aware scaling + * + * @group setParam + */ + def setRopeFreqBase(ropeFreqBase: Float): this.type = { set(this.ropeFreqBase, ropeFreqBase) } + + /** Set the RoPE frequency scaling factor, expands context by a factor of 1/N + * + * @group setParam + */ + def setRopeFreqScale(ropeFreqScale: Float): this.type = { + set(this.ropeFreqScale, ropeFreqScale) + } + + /** Set the YaRN extrapolation mix factor + * + * @group setParam + */ + def setYarnExtFactor(yarnExtFactor: Float): this.type = { + set(this.yarnExtFactor, yarnExtFactor) + } + + /** Set the YaRN scale sqrt(t) or attention magnitude + * + * @group setParam + */ + def setYarnAttnFactor(yarnAttnFactor: Float): this.type = { + set(this.yarnAttnFactor, yarnAttnFactor) + } + + /** Set the YaRN low correction dim or beta + * + * @group setParam + */ + def setYarnBetaFast(yarnBetaFast: Float): this.type = { set(this.yarnBetaFast, yarnBetaFast) } + + /** Set the YaRN high correction dim or alpha + * + * @group setParam + */ + def setYarnBetaSlow(yarnBetaSlow: Float): this.type = { set(this.yarnBetaSlow, yarnBetaSlow) } + + /** Set the YaRN original context size of model + * + * @group setParam + */ + def setYarnOrigCtx(yarnOrigCtx: Int): this.type = { set(this.yarnOrigCtx, yarnOrigCtx) } + + /** Set the KV cache defragmentation threshold + * + * @group setParam + */ + def setDefragmentationThreshold(defragThold: Float): this.type = { + set(this.defragmentationThreshold, defragThold) + } + + /** Set optimization strategies that help on some NUMA systems (if available) + * + * Available Strategies: + * + * - DISABLED: No NUMA optimizations + * - DISTRIBUTE: spread execution evenly over all + * - ISOLATE: only spawn threads on CPUs on the node that execution started on + * - NUMA_CTL: use the CPU map provided by numactl + * - MIRROR: Mirrors the model across NUMA nodes + * + * @group setParam + */ + def setNumaStrategy(numa: String): this.type = { set(this.numaStrategy, numa) } + + /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
+ * + * - UNSPECIFIED: Don't use any scaling + * - LINEAR: Linear scaling + * - YARN: YaRN RoPE scaling + * @group setParam + */ + def setRopeScalingType(ropeScalingType: String): this.type = { + set(this.ropeScalingType, ropeScalingType) + } + + /** Set the pooling type for embeddings, use model default if unspecified + * + * - UNSPECIFIED: Don't use any pooling + * - MEAN: Mean Pooling + * - CLS: CLS Pooling + * + * @group setParam + */ + def setPoolingType(poolingType: String): this.type = { set(this.poolingType, poolingType) } + + /** Set the draft model for speculative decoding + * + * @group setParam + */ + def setModelDraft(modelDraft: String): this.type = { set(this.modelDraft, modelDraft) } + + /** Set a model alias + * + * @group setParam + */ + def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { + set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) + } + + /** Set a model alias + * + * @group setParam + */ + def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { + set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) + } + + /** Sets paths to lora adapters with user defined scale. + * + * @group setParam + */ + def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { + set(this.loraAdapters, loraAdapters) + } + + /** Sets paths to lora adapters with user defined scale. (PySpark Override) + * + * @group setParam + */ + def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { + val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } + set(this.loraAdapters, scalaLoraAdapters.toMap) + } + + /** Whether to load model with embedding support + * + * @group setParam + */ + def setEmbedding(embedding: Boolean): this.type = { set(this.embedding, embedding) } + + /** Whether to enable Flash Attention + * + * @group setParam + */ + def setFlashAttention(flashAttention: Boolean): this.type = { + set(this.flashAttention, flashAttention) + } + + /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string + * + * @group setParam + */ + def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { + set(this.inputPrefixBos, inputPrefixBos) + } + + /** Whether to use memory-map model (faster load but may increase pageouts if not using mlock) + * + * @group setParam + */ + def setUseMmap(useMmap: Boolean): this.type = { set(this.useMmap, useMmap) } + + /** Whether to force the system to keep model in RAM rather than swapping or compressing + * + * @group setParam + */ + def setUseMlock(useMlock: Boolean): this.type = { set(this.useMlock, useMlock) } + + /** Whether to disable KV offload + * + * @group setParam + */ + def setNoKvOffload(noKvOffload: Boolean): this.type = { set(this.noKvOffload, noKvOffload) } + + /** Set a system prompt to use + * + * @group setParam + */ + def setSystemPrompt(systemPrompt: String): this.type = { set(this.systemPrompt, systemPrompt) } + + /** The chat template to use + * + * @group setParam + */ + def setChatTemplate(chatTemplate: String): this.type = { set(this.chatTemplate, chatTemplate) } + + // ---------------- GETTERS ---------------- + /** @group getParam */ + def getNThreads: Int = $(nThreads) + + /** @group getParam */ + def getNThreadsDraft: Int = $(nThreadsDraft) + + /** @group getParam */ + def getNThreadsBatch: Int = $(nThreadsBatch) + + /** @group getParam */ + def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) + + /** @group getParam */ + def getNCtx: Int = $(nCtx) 
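+
+  // Sizing note (an illustrative sketch, not a prescription): nCtx bounds the prompt plus
+  // generated tokens held in the KV cache, while nBatch/nUbatch only control how many prompt
+  // tokens are processed per step. For example, a typical single-GPU configuration might be:
+  //   model.setNCtx(4096).setNBatch(512).setNUbatch(512).setNGpuLayers(99)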
+ + /** @group getParam */ + def getNBatch: Int = $(nBatch) + + /** @group getParam */ + def getNUbatch: Int = $(nUbatch) + + /** @group getParam */ + def getNDraft: Int = $(nDraft) + + /** @group getParam */ + def getNChunks: Int = $(nChunks) + + /** @group getParam */ + def getNSequences: Int = $(nSequences) + + /** @group getParam */ + def getPSplit: Float = $(pSplit) + + /** @group getParam */ + def getNGpuLayers: Int = $(nGpuLayers) + + /** @group getParam */ + def getNGpuLayersDraft: Int = $(nGpuLayersDraft) + + /** @group getParam */ + def getSplitMode: String = $(gpuSplitMode) + + /** @group getParam */ + def getMainGpu: Int = $(mainGpu) + + /** @group getParam */ + def getTensorSplit: Array[Double] = $(tensorSplit) + + def getGrpAttnN: Int = $(grpAttnN) + + /** @group getParam */ + def getGrpAttnW: Int = $(grpAttnW) + + /** @group getParam */ + def getRopeFreqBase: Float = $(ropeFreqBase) + + /** @group getParam */ + def getRopeFreqScale: Float = $(ropeFreqScale) + + /** @group getParam */ + def getYarnExtFactor: Float = $(yarnExtFactor) + + /** @group getParam */ + def getYarnAttnFactor: Float = $(yarnAttnFactor) + + /** @group getParam */ + def getYarnBetaFast: Float = $(yarnBetaFast) + + /** @group getParam */ + def getYarnBetaSlow: Float = $(yarnBetaSlow) + + /** @group getParam */ + def getYarnOrigCtx: Int = $(yarnOrigCtx) + + /** @group getParam */ + def getDefragmentationThreshold: Float = $(defragmentationThreshold) + + /** @group getParam */ + def getNuma: String = $(numaStrategy) + + /** @group getParam */ + def getRopeScalingType: String = $(ropeScalingType) + + /** @group getParam */ + def getPoolingType: String = $(poolingType) + + /** @group getParam */ + def getModelDraft: String = $(modelDraft) + + /** @group getParam */ + def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) + + /** @group getParam */ + def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) + + /** @group getParam */ + def getLoraAdapters: Map[String, Float] = $$(loraAdapters) + + /** @group getParam */ + def getEmbedding: Boolean = $(embedding) + + /** @group getParam */ + def getFlashAttention: Boolean = $(flashAttention) + + /** @group getParam */ + def getInputPrefixBos: Boolean = $(inputPrefixBos) + + /** @group getParam */ + def getUseMmap: Boolean = $(useMmap) + + /** @group getParam */ + def getUseMlock: Boolean = $(useMlock) + + /** @group getParam */ + def getNoKvOffload: Boolean = $(noKvOffload) + + /** @group getParam */ + def getSystemPrompt: String = $(systemPrompt) + + /** @group getParam */ + def getChatTemplate: String = $(chatTemplate) + + // ---------------- INFERENCE PARAMETERS ---------------- + /** @group param */ + val inputPrefix = + new Param[String](this, "inputPrefix", "Set the prompt to start generation with") + + /** @group param */ + val inputSuffix = + new Param[String](this, "inputSuffix", "Set a suffix for infilling") + + /** @group param */ + val cachePrompt = new BooleanParam( + this, + "cachePrompt", + "Whether to remember the prompt to avoid reprocessing it") + + /** @group param */ + val nPredict = new IntParam(this, "nPredict", "Set the number of tokens to predict") + + /** @group param */ + val topK = new IntParam(this, "topK", "Set top-k sampling") + + /** @group param */ + val topP = new FloatParam(this, "topP", "Set top-p sampling") + + /** @group param */ + val minP = new FloatParam(this, "minP", "Set min-p sampling") + + /** @group param */ + val tfsZ = new FloatParam(this, "tfsZ", "Set tail free sampling, parameter 
z") + + /** @group param */ + val typicalP = new FloatParam(this, "typicalP", "Set locally typical sampling, parameter p") + + /** @group param */ + val temperature = new FloatParam(this, "temperature", "Set the temperature") + + /** @group param */ + val dynamicTemperatureRange = + new FloatParam(this, "dynatempRange", "Set the dynamic temperature range") + + /** @group param */ + val dynamicTemperatureExponent = + new FloatParam(this, "dynatempExponent", "Set the dynamic temperature exponent") + + /** @group param */ + val repeatLastN = + new IntParam(this, "repeatLastN", "Set the last n tokens to consider for penalties") + + /** @group param */ + val repeatPenalty = + new FloatParam(this, "repeatPenalty", "Set the penalty of repeated sequences of tokens") + + /** @group param */ + val frequencyPenalty = + new FloatParam(this, "frequencyPenalty", "Set the repetition alpha frequency penalty") + + /** @group param */ + val presencePenalty = + new FloatParam(this, "presencePenalty", "Set the repetition alpha presence penalty") + + /** @group param */ + val miroStat = new Param[String](this, "miroStat", "Set MiroStat sampling strategies.") + + /** @group param */ + val miroStatTau = + new FloatParam(this, "mirostatTau", "Set the MiroStat target entropy, parameter tau") + + /** @group param */ + val miroStatEta = + new FloatParam(this, "mirostatEta", "Set the MiroStat learning rate, parameter eta") + + /** @group param */ + val penalizeNl = new BooleanParam(this, "penalizeNl", "Whether to penalize newline tokens") + + /** @group param */ + val nKeep = + new IntParam(this, "nKeep", "Set the number of tokens to keep from the initial prompt") + + /** @group param */ + val seed = new IntParam(this, "seed", "Set the RNG seed") + + /** @group param */ + val nProbs = new IntParam( + this, + "nProbs", + "Set the amount top tokens probabilities to output if greater than 0.") + + /** @group param */ + val minKeep = new IntParam( + this, + "minKeep", + "Set the amount of tokens the samplers should return at least (0 = disabled)") + + /** @group param */ + val grammar = + new Param[String](this, "grammar", "Set BNF-like grammar to constrain generations") + + /** @group param */ + val penaltyPrompt = new Param[String]( + this, + "penaltyPrompt", + "Override which part of the prompt is penalized for repetition.") + + /** @group param */ + val ignoreEos = new BooleanParam( + this, + "ignoreEos", + "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)") + + // Modify the likelihood of tokens appearing in the completion by their id. + val tokenIdBias: StructFeature[Map[Int, Float]] = + new StructFeature[Map[Int, Float]](this, "tokenIdBias") + + // Modify the likelihood of tokens appearing in the completion by their string. 
+ /** @group param */ + val tokenBias: StructFeature[Map[String, Float]] = + new StructFeature[Map[String, Float]](this, "tokenBias") + + /** @group param */ + val disableTokenIds = + new IntArrayParam(this, "disableTokenIds", "Set the token ids to disable in the completion") + + /** @group param */ + val stopStrings = new StringArrayParam( + this, + "stopStrings", + "Set strings upon seeing which token generation is stopped") + + /** @group param */ + val samplers = new StringArrayParam( + this, + "samplers", + "Set which samplers to use for token generation in the given order") + + /** @group param */ + val useChatTemplate = new BooleanParam( + this, + "useChatTemplate", + "Set whether or not generate should apply a chat template") + + /** Set the prompt to start generation with + * + * @group setParam + */ + def setInputPrefix(inputPrefix: String): this.type = { set(this.inputPrefix, inputPrefix) } + + /** Set a suffix for infilling + * + * @group setParam + */ + def setInputSuffix(inputSuffix: String): this.type = { set(this.inputSuffix, inputSuffix) } + + /** Whether to remember the prompt to avoid reprocessing it + * + * @group setParam + */ + def setCachePrompt(cachePrompt: Boolean): this.type = { set(this.cachePrompt, cachePrompt) } + + /** Set the number of tokens to predict + * + * @group setParam + */ + def setNPredict(nPredict: Int): this.type = { set(this.nPredict, nPredict) } + + /** Set top-k sampling + * + * @group setParam + */ + def setTopK(topK: Int): this.type = { set(this.topK, topK) } + + /** Set top-p sampling + * + * @group setParam + */ + def setTopP(topP: Float): this.type = { set(this.topP, topP) } + + /** Set min-p sampling + * + * @group setParam + */ + def setMinP(minP: Float): this.type = { set(this.minP, minP) } + + /** Set tail free sampling, parameter z + * @group setParam + */ + def setTfsZ(tfsZ: Float): this.type = { set(this.tfsZ, tfsZ) } + + /** Set locally typical sampling, parameter p + * + * @group setParam + */ + def setTypicalP(typicalP: Float): this.type = { set(this.typicalP, typicalP) } + + /** Set the temperature + * + * @group setParam + */ + def setTemperature(temperature: Float): this.type = { set(this.temperature, temperature) } + + /** Set the dynamic temperature range + * + * @group setParam + */ + def setDynamicTemperatureRange(dynatempRange: Float): this.type = { + set(this.dynamicTemperatureRange, dynatempRange) + } + + /** Set the dynamic temperature exponent + * + * @group setParam + */ + def setDynamicTemperatureExponent(dynatempExponent: Float): this.type = { + set(this.dynamicTemperatureExponent, dynatempExponent) + } + + /** Set the last n tokens to consider for penalties + * + * @group setParam + */ + def setRepeatLastN(repeatLastN: Int): this.type = { set(this.repeatLastN, repeatLastN) } + + /** Set the penalty of repeated sequences of tokens + * + * @group setParam + */ + def setRepeatPenalty(repeatPenalty: Float): this.type = { + set(this.repeatPenalty, repeatPenalty) + } + + /** Set the repetition alpha frequency penalty + * + * @group setParam + */ + def setFrequencyPenalty(frequencyPenalty: Float): this.type = { + set(this.frequencyPenalty, frequencyPenalty) + } + + /** Set the repetition alpha presence penalty + * + * @group setParam + */ + def setPresencePenalty(presencePenalty: Float): this.type = { + set(this.presencePenalty, presencePenalty) + } + + /** Set MiroStat sampling strategies. 
+ + * + * - DISABLED: No MiroStat + * - V1: MiroStat V1 + * - V2: MiroStat V2 + * + * @group setParam + */ + def setMiroStat(mirostat: String): this.type = set(this.miroStat, mirostat) + + /** Set the MiroStat target entropy, parameter tau + * + * @group setParam + */ + def setMiroStatTau(mirostatTau: Float): this.type = { set(this.miroStatTau, mirostatTau) } + + /** Set the MiroStat learning rate, parameter eta + * + * @group setParam + */ + def setMiroStatEta(mirostatEta: Float): this.type = { set(this.miroStatEta, mirostatEta) } + + /** Set whether to penalize newline tokens + * + * @group setParam + */ + def setPenalizeNl(penalizeNl: Boolean): this.type = { set(this.penalizeNl, penalizeNl) } + + /** Set the number of tokens to keep from the initial prompt + * + * @group setParam + */ + def setNKeep(nKeep: Int): this.type = { set(this.nKeep, nKeep) } + + /** Set the RNG seed + * + * @group setParam + */ + def setSeed(seed: Int): this.type = { set(this.seed, seed) } + + /** Set the number of top token probabilities to output, if greater than 0. + * + * @group setParam + */ + def setNProbs(nProbs: Int): this.type = { set(this.nProbs, nProbs) } + + /** Set the minimum number of tokens the samplers should return (0 = disabled) + * + * @group setParam + */ + def setMinKeep(minKeep: Int): this.type = { set(this.minKeep, minKeep) } + + /** Set BNF-like grammar to constrain generations + * + * @group setParam + */ + def setGrammar(grammar: String): this.type = { set(this.grammar, grammar) } + + /** Override which part of the prompt is penalized for repetition. + * + * @group setParam + */ + def setPenaltyPrompt(penaltyPrompt: String): this.type = { + set(this.penaltyPrompt, penaltyPrompt) + } + + /** Set whether to ignore end of stream token and continue generating (implies --logit-bias + * 2-inf) + * + * @group setParam + */ + def setIgnoreEos(ignoreEos: Boolean): this.type = { set(this.ignoreEos, ignoreEos) } + + /** Adjust the likelihood of tokens appearing in the completion by their string. + * + * @group setParam + */ + def setTokenBias(tokenBias: Map[String, Float]): this.type = { + set(this.tokenBias, tokenBias) + } + + /** Adjust the likelihood of tokens appearing in the completion by their string. (Override for PySpark) + * + * @group setParam + */ + def setTokenBias(tokenBias: java.util.HashMap[String, java.lang.Double]): this.type = { + val scalaTokenBias = tokenBias.asScala.map { case (k, v) => k -> v.floatValue() } + set(this.tokenBias, scalaTokenBias.toMap) + } + + /** Adjust the likelihood of tokens appearing in the completion by their id. + * + * @group setParam + */ + def setTokenIdBias(tokenIdBias: Map[Int, Float]): this.type = { + set(this.tokenIdBias, tokenIdBias) + } + + /** Adjust the likelihood of tokens appearing in the completion by their id. (Override for PySpark) + * + * @group setParam + */ + def setTokenIdBias(tokenIdBias: java.util.HashMap[Integer, java.lang.Double]): this.type = { + val scalaTokenIdBias = tokenIdBias.asScala.map { case (k, v) => k.toInt -> v.toFloat } + set(this.tokenIdBias, scalaTokenIdBias.toMap) + } + + /** Set the token ids to disable in the completion. This corresponds to `setTokenIdBias` with a + * value of `Float.NEGATIVE_INFINITY`. + * + * @group setParam + */ + def setDisableTokenIds(disableTokenIds: Array[Int]): this.type = { + set(this.disableTokenIds, disableTokenIds) + } + + /** Set strings upon seeing which token generation is stopped + * + * @group setParam + */ + def setStopStrings(stopStrings: Array[String]): this.type = { + set(this.stopStrings, stopStrings) + } + + /** Set which samplers to use for token generation in the given order.
+ * + * Available Samplers are: + * + * - TOP_K: Top-k sampling + * - TFS_Z: Tail free sampling + * - TYPICAL_P: Locally typical sampling p + * - TOP_P: Top-p sampling + * - MIN_P: Min-p sampling + * - TEMPERATURE: Temperature sampling + * @group setParam + */ + def setSamplers(samplers: Array[String]): this.type = { set(this.samplers, samplers) } + + /** Set whether or not generate should apply a chat template + * + * @group setParam + */ + def setUseChatTemplate(useChatTemplate: Boolean): this.type = { + set(this.useChatTemplate, useChatTemplate) + } + + // ---------------- GETTERS ---------------- + /** @group getParam */ + def getInputPrefix: String = $(inputPrefix) + + /** @group getParam */ + def getInputSuffix: String = $(inputSuffix) + + /** @group getParam */ + def getCachePrompt: Boolean = $(cachePrompt) + + def getNPredict: Int = $(nPredict) + + /** @group getParam */ + def getTopK: Int = $(topK) + + /** @group getParam */ + def getTopP: Float = $(topP) + + /** @group getParam */ + def getMinP: Float = $(minP) + + /** @group getParam */ + def getTfsZ: Float = $(tfsZ) + + /** @group getParam */ + def getTypicalP: Float = $(typicalP) + + /** @group getParam */ + def getTemperature: Float = $(temperature) + + /** @group getParam */ + def getDynamicTemperatureRange: Float = $(dynamicTemperatureRange) + + /** @group getParam */ + def getDynamicTemperatureExponent: Float = $(dynamicTemperatureExponent) + + /** @group getParam */ + def getRepeatLastN: Int = $(repeatLastN) + + /** @group getParam */ + def getRepeatPenalty: Float = $(repeatPenalty) + + /** @group getParam */ + def getFrequencyPenalty: Float = $(frequencyPenalty) + + /** @group getParam */ + def getPresencePenalty: Float = $(presencePenalty) + + /** @group getParam */ + def getMiroStat: String = $(miroStat) + + /** @group getParam */ + def getMiroStatTau: Float = $(miroStatTau) + + /** @group getParam */ + def getMiroStatEta: Float = $(miroStatEta) + + /** @group getParam */ + def getPenalizeNl: Boolean = $(penalizeNl) + + /** @group getParam */ + def getNKeep: Int = $(nKeep) + + /** @group getParam */ + def getSeed: Int = $(seed) + + /** @group getParam */ + def getNProbs: Int = $(nProbs) + + /** @group getParam */ + def getMinKeep: Int = $(minKeep) + + /** @group getParam */ + def getGrammar: String = $(grammar) + + /** @group getParam */ + def getPenaltyPrompt: String = $(penaltyPrompt) + + /** @group getParam */ + def getIgnoreEos: Boolean = $(ignoreEos) + + /** @group getParam */ + def getTokenIdBias: Map[Int, Float] = $$(tokenIdBias) + + /** @group getParam */ + def getTokenBias: Map[String, Float] = $$(tokenBias) + + /** @group getParam */ + def getDisableTokenIds: Array[Int] = $(disableTokenIds) + + /** @group getParam */ + def getStopStrings: Array[String] = $(stopStrings) + + /** @group getParam */ + def getSamplers: Array[String] = $(samplers) + + /** @group getParam */ + def getUseChatTemplate: Boolean = $(useChatTemplate) + + protected def getModelParameters: ModelParameters = { + val modelParameters = new ModelParameters().setContinuousBatching(true) // Always enabled + + if (isDefined(chatTemplate)) modelParameters.setChatTemplate($(chatTemplate)) + if (isDefined(defragmentationThreshold)) + modelParameters.setDefragmentationThreshold($(defragmentationThreshold)) + if (isDefined(embedding)) modelParameters.setEmbedding($(embedding)) + if (isDefined(flashAttention)) modelParameters.setFlashAttention($(flashAttention)) + if (isDefined(gpuSplitMode)) + 
modelParameters.setSplitMode(GpuSplitMode.valueOf($(gpuSplitMode))) + if (isDefined(grpAttnN)) modelParameters.setGrpAttnN($(grpAttnN)) + if (isDefined(grpAttnW)) modelParameters.setGrpAttnW($(grpAttnW)) + if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos($(inputPrefixBos)) + if (isDefined(lookupCacheDynamicFilePath)) + modelParameters.setLookupCacheDynamicFilePath($(lookupCacheDynamicFilePath)) + if (isDefined(lookupCacheStaticFilePath)) + modelParameters.setLookupCacheStaticFilePath($(lookupCacheStaticFilePath)) + if (isDefined(mainGpu)) modelParameters.setMainGpu($(mainGpu)) + if (isDefined(modelDraft)) modelParameters.setModelDraft($(modelDraft)) + if (isDefined(nBatch)) modelParameters.setNBatch($(nBatch)) + if (isDefined(nChunks)) modelParameters.setNChunks($(nChunks)) + if (isDefined(nCtx)) modelParameters.setNCtx($(nCtx)) + if (isDefined(nDraft)) modelParameters.setNDraft($(nDraft)) + if (isDefined(nGpuLayers)) modelParameters.setNGpuLayers($(nGpuLayers)) + if (isDefined(nGpuLayersDraft)) modelParameters.setNGpuLayersDraft($(nGpuLayersDraft)) + if (isDefined(nSequences)) modelParameters.setNSequences($(nSequences)) + if (isDefined(nThreads)) modelParameters.setNThreads($(nThreads)) + if (isDefined(nThreadsBatch)) modelParameters.setNThreadsBatch($(nThreadsBatch)) + if (isDefined(nThreadsBatchDraft)) + modelParameters.setNThreadsBatchDraft($(nThreadsBatchDraft)) + if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft($(nThreadsDraft)) + if (isDefined(nUbatch)) modelParameters.setNUbatch($(nUbatch)) + if (isDefined(noKvOffload)) modelParameters.setNoKvOffload($(noKvOffload)) + if (isDefined(numaStrategy)) modelParameters.setNuma(NumaStrategy.valueOf($(numaStrategy))) + if (isDefined(pSplit)) modelParameters.setPSplit($(pSplit)) + if (isDefined(poolingType)) + modelParameters.setPoolingType(PoolingType.valueOf($(poolingType))) + if (isDefined(ropeFreqBase)) modelParameters.setRopeFreqBase($(ropeFreqBase)) + if (isDefined(ropeFreqScale)) modelParameters.setRopeFreqScale($(ropeFreqScale)) + if (isDefined(ropeScalingType)) + modelParameters.setRopeScalingType(RopeScalingType.valueOf($(ropeScalingType))) + if (isDefined(systemPrompt)) modelParameters.setSystemPrompt($(systemPrompt)) + if (isDefined(tensorSplit)) modelParameters.setTensorSplit($(tensorSplit).map(_.toFloat)) + if (isDefined(useMlock)) modelParameters.setUseMlock($(useMlock)) + if (isDefined(useMmap)) modelParameters.setUseMmap($(useMmap)) + if (isDefined(yarnAttnFactor)) modelParameters.setYarnAttnFactor($(yarnAttnFactor)) + if (isDefined(yarnBetaFast)) modelParameters.setYarnBetaFast($(yarnBetaFast)) + if (isDefined(yarnBetaSlow)) modelParameters.setYarnBetaSlow($(yarnBetaSlow)) + if (isDefined(yarnExtFactor)) modelParameters.setYarnExtFactor($(yarnExtFactor)) + if (isDefined(yarnOrigCtx)) modelParameters.setYarnOrigCtx($(yarnOrigCtx)) + if (loraAdapters.isSet) { + val loraAdaptersMap: mutable.Map[String, java.lang.Float] = + mutable.Map($$(loraAdapters).map { case (key, value) => + (key, float2Float(value)) + }.toSeq: _*) + modelParameters.setLoraAdapters(loraAdaptersMap.asJava) + } // Need to convert to mutable map first + + modelParameters + } + + protected def getInferenceParameters: InferenceParameters = { + val inferenceParams = new InferenceParameters("") + if (isDefined(cachePrompt)) inferenceParams.setCachePrompt($(cachePrompt)) + if (isDefined(disableTokenIds)) { + val javaCollection: java.util.Collection[Integer] = + $(disableTokenIds).map(int2Integer).toSeq.asJava +
inferenceParams.disableTokenIds(javaCollection) + } + if (isDefined(dynamicTemperatureExponent)) + inferenceParams.setDynamicTemperatureExponent($(dynamicTemperatureExponent)) + if (isDefined(dynamicTemperatureRange)) + inferenceParams.setDynamicTemperatureRange($(dynamicTemperatureRange)) + if (isDefined(frequencyPenalty)) inferenceParams.setFrequencyPenalty($(frequencyPenalty)) + if (isDefined(grammar)) inferenceParams.setGrammar($(grammar)) + if (isDefined(ignoreEos)) inferenceParams.setIgnoreEos($(ignoreEos)) + if (isDefined(inputPrefix)) inferenceParams.setInputPrefix($(inputPrefix)) + if (isDefined(inputSuffix)) inferenceParams.setInputSuffix($(inputSuffix)) + if (isDefined(minKeep)) inferenceParams.setMinKeep($(minKeep)) + if (isDefined(minP)) inferenceParams.setMinP($(minP)) + if (isDefined(miroStat)) inferenceParams.setMiroStat(MiroStat.valueOf($(miroStat))) + if (isDefined(miroStatEta)) inferenceParams.setMiroStatEta($(miroStatEta)) + if (isDefined(miroStatTau)) inferenceParams.setMiroStatTau($(miroStatTau)) + if (isDefined(nKeep)) inferenceParams.setNKeep($(nKeep)) + if (isDefined(nPredict)) inferenceParams.setNPredict($(nPredict)) + if (isDefined(nProbs)) inferenceParams.setNProbs($(nProbs)) + if (isDefined(penalizeNl)) inferenceParams.setPenalizeNl($(penalizeNl)) + if (isDefined(penaltyPrompt)) inferenceParams.setPenaltyPrompt($(penaltyPrompt)) + if (isDefined(presencePenalty)) inferenceParams.setPresencePenalty($(presencePenalty)) + if (isDefined(repeatLastN)) inferenceParams.setRepeatLastN($(repeatLastN)) + if (isDefined(repeatPenalty)) inferenceParams.setRepeatPenalty($(repeatPenalty)) + if (isDefined(samplers)) inferenceParams.setSamplers($(samplers).map(Sampler.valueOf): _*) + if (isDefined(seed)) inferenceParams.setSeed($(seed)) + if (isDefined(stopStrings)) inferenceParams.setStopStrings($(stopStrings): _*) + if (isDefined(temperature)) inferenceParams.setTemperature($(temperature)) + if (isDefined(tfsZ)) inferenceParams.setTfsZ($(tfsZ)) + if (isDefined(topK)) inferenceParams.setTopK($(topK)) + if (isDefined(topP)) inferenceParams.setTopP($(topP)) + if (isDefined(typicalP)) inferenceParams.setTypicalP($(typicalP)) + if (isDefined(useChatTemplate)) inferenceParams.setUseChatTemplate($(useChatTemplate)) + if (tokenBias.isSet) { + val tokenBiasMap: mutable.Map[String, java.lang.Float] = mutable.Map($$(tokenBias).map { + case (key, value) => (key, float2Float(value)) + }.toSeq: _*) + inferenceParams.setTokenBias(tokenBiasMap.asJava) + } + if (tokenIdBias.isSet) { + val tokenIdBiasMap: mutable.Map[Integer, java.lang.Float] = + mutable.Map($$(tokenIdBias).map { case (key, value) => + (int2Integer(key), float2Float(value)) + }.toSeq: _*) + inferenceParams.setTokenIdBias(tokenIdBiasMap.asJava) + } + + inferenceParams + } + + // ---------------- METADATA ---------------- + val metadata = + new Param[String](this, "metadata", "Set the metadata for the model").setProtected() + + /** Set the metadata for the model + * @group setParam + */ + def setMetadata(metadata: String): this.type = { set(this.metadata, metadata) } + + /** Get the metadata for the model + * @group getParam + */ + def getMetadata: String = $(metadata) +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index 98387ad04fb14a..1b46ec8330bc48 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -812,11 +812,13 @@ package object annotator { object UAEEmbeddings extends 
ReadablePretrainedUAEModel with ReadUAEDLModel + type AutoGGUFModel = com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel + object AutoGGUFModel extends ReadablePretrainedAutoGGUFModel with ReadAutoGGUFModel + type MxbaiEmbeddings = com.johnsnowlabs.nlp.embeddings.MxbaiEmbeddings object MxbaiEmbeddings extends ReadablePretrainedMxbaiModel with ReadMxbaiDLModel - type SnowFlakeEmbeddings = com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala new file mode 100644 index 00000000000000..11d41bdb4d739a --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -0,0 +1,271 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.nlp.annotators.seq2seq + +import com.johnsnowlabs.ml.gguf.GGUFWrapper +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.nlp.llama.LlamaModel +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods + +/** Annotator that uses the llama.cpp library to generate text completions with large language + * models. + * + * For settable parameters, and their explanations, see [[HasLlamaCppProperties]] and refer to + * the llama.cpp documentation of + * [[https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server server.cpp]] + * for more information. + * + * If the parameters are not set, the annotator will default to use the parameters provided by + * the model. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val autoGGUFModel = AutoGGUFModel.pretrained() + * .setInputCols("document") + * .setOutputCol("completions") + * }}} + * The default model is `"gguf-phi3-mini-4k-instruct-q4"`, if no name is provided. + * + * For available pretrained models please see the [[https://sparknlp.org/models Models Hub]]. + * + * For extended examples of usage, see the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala AutoGGUFModelTest]] + * and the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFModel.ipynb example notebook]]. + * + * ==Note== + * To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + * the number of GPU layers with the `setNGpuLayers` method. + * + * When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + * according to your hardware to avoid out-of-memory errors. 
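The note above is the main guidance for memory tuning. A minimal sketch, not part of this PR, of what such a configuration might look like; the values are illustrative and need to be adapted to the model and hardware, they are not defaults of this annotator:

```scala
// Sketch only: memory-related knobs for larger GGUF models (illustrative values).
val tunedModel = AutoGGUFModel
  .pretrained()
  .setInputCols("document")
  .setOutputCol("completions")
  .setNCtx(4096)      // context window in tokens; larger windows use more memory
  .setNGpuLayers(99)  // offload (up to) all layers to the GPU (GPU package only)
  .setNBatch(512)     // logical batch size for prompt processing
```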
+ * + * ==Example== + * + * {{{ + * import com.johnsnowlabs.nlp.base._ + * import com.johnsnowlabs.nlp.annotator._ + * import org.apache.spark.ml.Pipeline + * import spark.implicits._ + * + * val document = new DocumentAssembler() + * .setInputCol("text") + * .setOutputCol("document") + * + * val autoGGUFModel = AutoGGUFModel + * .pretrained() + * .setInputCols("document") + * .setOutputCol("completions") + * .setBatchSize(4) + * .setNPredict(20) + * .setNGpuLayers(99) + * .setTemperature(0.4f) + * .setTopK(40) + * .setTopP(0.9f) + * .setPenalizeNl(true) + * + * val pipeline = new Pipeline().setStages(Array(document, autoGGUFModel)) + * + * val data = Seq("Hello, I am a").toDF("text") + * val result = pipeline.fit(data).transform(data) + * result.select("completions").show(truncate = false) + * +-----------------------------------------------------------------------------------------------------------------------------------+ + * |completions | + * +-----------------------------------------------------------------------------------------------------------------------------------+ + * |[{document, 0, 78, new user. I am currently working on a project and I need to create a list of , {prompt -> Hello, I am a}, []}]| + * +-----------------------------------------------------------------------------------------------------------------------------------+ + * }}} + * + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +class AutoGGUFModel(override val uid: String) + extends AnnotatorModel[AutoGGUFModel] + with HasBatchedAnnotate[AutoGGUFModel] + with HasEngine + with HasLlamaCppProperties + with HasProtectedParams { + + override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT + override val inputAnnotatorTypes: Array[AnnotatorType] = Array(AnnotatorType.DOCUMENT) + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("AutoGGUFModel")) + + private var _model: Option[Broadcast[GGUFWrapper]] = None + + // Values for automatic GPU support + private val defaultGpuLayers = 1000 + private val defaultMainGpu = 0 + + /** @group getParam */ + def getModelIfNotSet: GGUFWrapper = _model.get.value + + /** @group setParam */ + def setModelIfNotSet(spark: SparkSession, wrapper: GGUFWrapper): this.type = { + if (_model.isEmpty) { + _model = Some(spark.sparkContext.broadcast(wrapper)) + } + + // Entrypoint for models. Automatically set GPU support if detected. + val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) + if (usingGPUJar) { + logger.info("Using GPU jar. Offloading all layers to GPU.") + setMainGpu(defaultMainGpu) + setNGpuLayers(defaultGpuLayers) + } + this + } + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + getModelIfNotSet.saveToFile(path) + } + + /** Completes the batch of annotations. 
+ * + * @param batchedAnnotations + * Annotations (single element arrays) in batches + * @return + * Completed text sequences + */ + override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { + val annotations: Seq[Annotation] = batchedAnnotations.flatten + if (annotations.nonEmpty) { + + val modelParams = + getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size + val inferenceParams = getInferenceParameters + + val model: LlamaModel = getModelIfNotSet.getSession(modelParams) + + val annotationsText = annotations.map(_.result) + + val (completedTexts: Array[String], metadata: Map[String, String]) = + try { + (model.requestBatchCompletion(annotationsText.toArray, inferenceParams), Map.empty) + } catch { + case e: Exception => + logger.error("Error in llama.cpp batch completion", e) + (Array[String](), Map("exception" -> e.getMessage)) + } + + val result: Seq[Seq[Annotation]] = + annotations.zip(completedTexts).map { case (annotation, text) => + Seq( + new Annotation( + outputAnnotatorType, + 0, + text.length - 1, + text, + annotation.metadata ++ metadata)) + } + result + } else Seq(Seq.empty[Annotation]) + } + + def getMetadataMap: Map[String, String] = { + val metadataJsonString = getMetadata + if (metadataJsonString.isEmpty) Map.empty + else { + implicit val formats: DefaultFormats.type = DefaultFormats + JsonMethods.parse(metadataJsonString).extract[Map[String, String]] + } + } +} + +trait ReadablePretrainedAutoGGUFModel + extends ParamsAndFeaturesReadable[AutoGGUFModel] + with HasPretrained[AutoGGUFModel] { + override val defaultModelName: Some[String] = Some("gguf-phi3-mini-4k-instruct-q4") + override val defaultLang: String = "en" + + /** Java compliant-overrides */ + override def pretrained(): AutoGGUFModel = super.pretrained() + + override def pretrained(name: String): AutoGGUFModel = super.pretrained(name) + + override def pretrained(name: String, lang: String): AutoGGUFModel = + super.pretrained(name, lang) + + override def pretrained(name: String, lang: String, remoteLoc: String): AutoGGUFModel = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadAutoGGUFModel { + this: ParamsAndFeaturesReadable[AutoGGUFModel] => + + def readModel(instance: AutoGGUFModel, path: String, spark: SparkSession): Unit = { + def findGGUFModelInFolder(): String = { + val folder = new java.io.File(path) + if (folder.exists && folder.isDirectory) { + folder.listFiles + .filter(_.isFile) + .filter(_.getName.endsWith(".gguf")) + .map(_.getAbsolutePath) + .headOption // Should only be one file + .getOrElse(throw new IllegalArgumentException(s"Could not find GGUF model in $path")) + } else { + throw new IllegalArgumentException(s"Path $path is not a directory") + } + } + + val model = AutoGGUFModel.loadSavedModel(findGGUFModelInFolder(), spark) + instance.setModelIfNotSet(spark, model.getModelIfNotSet) + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): AutoGGUFModel = { + // TODO potentially enable download from HF-URLS + val localPath: String = ResourceHelper.copyToLocal(modelPath) + val annotatorModel = new AutoGGUFModel() + annotatorModel + .setModelIfNotSet(spark, GGUFWrapper.read(spark, localPath)) + + val metadata = LlamaModel.getMetadataFromFile(localPath) + if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) + annotatorModel + } +} + +/** This is the companion object of [[AutoGGUFModel]]. Please refer to that class for the + * documentation. 
+ */ +object AutoGGUFModel extends ReadablePretrainedAutoGGUFModel with ReadAutoGGUFModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index c4d887c3e03934..8ed41de985baa9 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -32,14 +32,7 @@ import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel import com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentModel import com.johnsnowlabs.nlp.annotators.sentence_detector_dl.SentenceDetectorDLModel -import com.johnsnowlabs.nlp.annotators.seq2seq.{ - BartTransformer, - GPT2Transformer, - LLAMA2Transformer, - M2M100Transformer, - MarianTransformer, - T5Transformer -} +import com.johnsnowlabs.nlp.annotators.seq2seq._ import com.johnsnowlabs.nlp.annotators.spell.context.ContextSpellCheckerModel import com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingModel import com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteModel @@ -691,9 +684,10 @@ object PythonResourceDownloader { "LLAMA2Transformer" -> LLAMA2Transformer, "M2M100Transformer" -> M2M100Transformer, "UAEEmbeddings" -> UAEEmbeddings, + "AutoGGUFModel" -> AutoGGUFModel, "AlbertForZeroShotClassification" -> AlbertForZeroShotClassification, "MxbaiEmbeddings" -> MxbaiEmbeddings, - "SnowFlakeEmbeddings" -> SnowFlakeEmbeddings, + "SnowFlakeEmbeddings" -> SnowFlakeEmbeddings ) // List pairs of types such as the one with key type can load a pretrained model from the value type diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index 703f281a1da1d1..6a17297f6fda41 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=WARN, STDOUT +log4j.rootLogger=DEBUG, STDOUT log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n diff --git a/src/test/resources/log4j2.properties b/src/test/resources/log4j2.properties index 703f281a1da1d1..6a17297f6fda41 100644 --- a/src/test/resources/log4j2.properties +++ b/src/test/resources/log4j2.properties @@ -1,4 +1,4 @@ -log4j.rootLogger=WARN, STDOUT +log4j.rootLogger=DEBUG, STDOUT log4j.appender.STDOUT=org.apache.log4j.ConsoleAppender log4j.appender.STDOUT.layout=org.apache.log4j.PatternLayout log4j.appender.STDOUT.layout.ConversionPattern=[%5p] %m%n diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala new file mode 100644 index 00000000000000..b4234f24197b7c --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala @@ -0,0 +1,187 @@ +package com.johnsnowlabs.nlp.annotators.seq2seq + +import com.johnsnowlabs.nlp.Annotation +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.apache.spark.sql.DataFrame +import org.scalatest.flatspec.AnyFlatSpec + +class AutoGGUFModelTest extends AnyFlatSpec { + + import ResourceHelper.spark.implicits._ + + behavior of "AutoGGUFModelTest" + + // Set Spark Debug level + 
ResourceHelper.spark.sparkContext.setLogLevel("INFO") + + lazy val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + lazy val model = AutoGGUFModel + .pretrained() + .setInputCols("document") + .setOutputCol("completions") + .setBatchSize(4) + .setNPredict(20) + .setNGpuLayers(99) + .setTemperature(0.4f) + .setNCtx(512) + .setTopK(40) + .setTopP(0.9f) + .setPenalizeNl(true) + + lazy val data = Seq( + "The moons of Jupiter are ", // "The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones. The four" + "Earth is ", // "Earth is 4.5 billion years old. It has been home to countless species, some of which have gone extinct, while others have evolved into" + "The moon is ", // "The moon is 1/400th the size of the sun. The sun is 1.39 million kilometers in diameter, while" + "The sun is " // + ).toDF("text").repartition(1) + + lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) + + def assertAnnotationsNonEmpty(resultDf: DataFrame): Unit = { + Annotation + .collect(resultDf, "completions") + .foreach(annotations => { + println(annotations.head) + assert(annotations.head.result.nonEmpty) + }) + } + + it should "create completions" taggedAs SlowTest in { + val data = Seq("Hello, I am a").toDF("text") + val result = pipeline.fit(data).transform(data) + assertAnnotationsNonEmpty(result) + } + + it should "create batch completions" taggedAs SlowTest in { + val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) + val result = pipeline.fit(data).transform(data) + assertAnnotationsNonEmpty(result) + } + + it should "be serializable" taggedAs SlowTest in { + val data = Seq("Hello, I am a").toDF("text") + lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) + model.setNPredict(5) + + val pipelineModel = pipeline.fit(data) + val savePath = "./tmp_autogguf_model" + pipelineModel.stages.last + .asInstanceOf[AutoGGUFModel] + .write + .overwrite() + .save(savePath) + + val loadedModel = AutoGGUFModel.load(savePath) + val newPipeline: Pipeline = new Pipeline().setStages(Array(documentAssembler, loadedModel)) + + newPipeline + .fit(data) + .transform(data) + .select("completions") + .show(truncate = false) + } + + it should "accept all parameters that are settable" taggedAs SlowTest in { + // Model Parameters + model.setNThreads(8) + model.setNThreadsDraft(8) + model.setNThreadsBatch(8) + model.setNThreadsBatchDraft(8) + model.setNCtx(512) + model.setNBatch(32) + model.setNUbatch(32) + model.setNDraft(5) + model.setNChunks(-1) + model.setNSequences(1) + model.setPSplit(0.1f) + model.setNGpuLayers(99) + model.setNGpuLayersDraft(99) + model.setGpuSplitMode("NONE") + model.setMainGpu(0) + model.setTensorSplit(Array[Double]()) + model.setGrpAttnN(1) + model.setGrpAttnW(512) + model.setRopeFreqBase(1.0f) + model.setRopeFreqScale(1.0f) + model.setYarnExtFactor(1.0f) + model.setYarnAttnFactor(1.0f) + model.setYarnBetaFast(32.0f) + model.setYarnBetaSlow(1.0f) + model.setYarnOrigCtx(0) + model.setDefragmentationThreshold(-1.0f) + model.setNumaStrategy("DISTRIBUTE") + model.setRopeScalingType("UNSPECIFIED") + model.setPoolingType("UNSPECIFIED") + model.setModelDraft("") + model.setLookupCacheStaticFilePath("/tmp/sparknlp-llama-cpp-cache") + model.setLookupCacheDynamicFilePath("/tmp/sparknlp-llama-cpp-cache") + model.setEmbedding(false) + model.setFlashAttention(false) + model.setInputPrefixBos(false) + model.setUseMmap(false) + model.setUseMlock(false) + 
model.setNoKvOffload(false) + model.setSystemPrompt("") + model.setChatTemplate("") + + // Inference Parameters + model.setInputPrefix("") + model.setInputSuffix("") + model.setCachePrompt(false) + model.setNPredict(-1) + model.setTopK(40) + model.setTopP(0.9f) + model.setMinP(0.1f) + model.setTfsZ(1.0f) + model.setTypicalP(1.0f) + model.setTemperature(0.8f) + model.setDynamicTemperatureRange(0.0f) + model.setDynamicTemperatureExponent(1.0f) + model.setRepeatLastN(64) + model.setRepeatPenalty(1.0f) + model.setFrequencyPenalty(0.0f) + model.setPresencePenalty(0.0f) + model.setMiroStat("DISABLED") + model.setMiroStatTau(5.0f) + model.setMiroStatEta(0.1f) + model.setPenalizeNl(false) + model.setNKeep(0) + model.setSeed(-1) + model.setNProbs(0) + model.setMinKeep(0) + model.setGrammar("") + model.setPenaltyPrompt("") + model.setIgnoreEos(false) + model.setDisableTokenIds(Array[Int]()) + model.setStopStrings(Array[String]()) + model.setUseChatTemplate(false) + model.setNPredict(2) + model.setSamplers(Array("TOP_P", "TOP_K")) + + // Struct Features + model.setTokenIdBias(Map(0 -> 0.0f, 1 -> 0.0f)) + model.setTokenBias(Map("!" -> 0.0f, "?" -> 0.0f)) + model.setLoraAdapters(Map(" " -> 0.0f)) + + lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model)) + + val result = pipeline.fit(data).transform(data) + result.select("completions").show(truncate = false) + } + + it should "contain metadata when loadSavedModel" taggedAs SlowTest in { + lazy val modelPath = "models/codellama-7b.Q2_K.gguf" + val model = AutoGGUFModel.loadSavedModel(modelPath, ResourceHelper.spark) + val metadata = model.getMetadata + assert(metadata.nonEmpty) + + val metadataMap = model.getMetadataMap + assert(metadataMap.nonEmpty) + } +}
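Beyond `pretrained()`, the reader added in this PR also supports loading a local GGUF file directly via `loadSavedModel`, which reads the file's metadata at load time. A minimal sketch, assuming the same local model path the test above uses actually exists on disk:

```scala
// Sketch only: load a local GGUF file and inspect its metadata.
import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel
import com.johnsnowlabs.nlp.util.io.ResourceHelper

val localModel = AutoGGUFModel
  .loadSavedModel("models/codellama-7b.Q2_K.gguf", ResourceHelper.spark)
  .setInputCols("document")
  .setOutputCol("completions")

// Metadata is stored as a protected JSON string param and can also be read as a parsed map.
println(localModel.getMetadata)
localModel.getMetadataMap.foreach { case (key, value) => println(s"$key -> $value") }
```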