From 3b029c52c58a63c5899dee3f14edb950aa69f60d Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 14:04:58 -0700
Subject: [PATCH 1/6] feat: switch cpu backend to llama.cpp

---
 crates/llama-cpp-bindings/include/engine.h | 2 +-
 crates/llama-cpp-bindings/src/engine.cc    | 4 ++--
 crates/llama-cpp-bindings/src/lib.rs       | 5 +++--
 crates/tabby/src/serve/engine.rs           | 8 ++++----
 crates/tabby/src/serve/mod.rs              | 2 +-
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/crates/llama-cpp-bindings/include/engine.h b/crates/llama-cpp-bindings/include/engine.h
index 834a1d753f81..fffc0a25586f 100644
--- a/crates/llama-cpp-bindings/include/engine.h
+++ b/crates/llama-cpp-bindings/include/engine.h
@@ -16,5 +16,5 @@ class TextInferenceEngine {
   virtual uint32_t eos_token() const = 0;
 };
 
-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path);
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path);
 }  // namespace

diff --git a/crates/llama-cpp-bindings/src/engine.cc b/crates/llama-cpp-bindings/src/engine.cc
index abcaee0e28db..7f3f2986cd2a 100644
--- a/crates/llama-cpp-bindings/src/engine.cc
+++ b/crates/llama-cpp-bindings/src/engine.cc
@@ -114,11 +114,11 @@ struct BackendInitializer {
 };
 }  // namespace
 
-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path) {
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path) {
   static BackendInitializer initializer;
 
   llama_model_params model_params = llama_model_default_params();
-  model_params.n_gpu_layers = 1;
+  model_params.n_gpu_layers = use_gpu ? 1 : 0;
 
   llama_model* model = llama_load_model_from_file(std::string(model_path).c_str(), model_params);
   if (!model) {

diff --git a/crates/llama-cpp-bindings/src/lib.rs b/crates/llama-cpp-bindings/src/lib.rs
index 084280b62c95..53870fc5abd3 100644
--- a/crates/llama-cpp-bindings/src/lib.rs
+++ b/crates/llama-cpp-bindings/src/lib.rs
@@ -15,7 +15,7 @@ mod ffi {
 
         type TextInferenceEngine;
 
-        fn create_engine(model_path: &str) -> UniquePtr<TextInferenceEngine>;
+        fn create_engine(use_gpu: bool, model_path: &str) -> UniquePtr<TextInferenceEngine>;
 
         fn start(self: Pin<&mut TextInferenceEngine>, input_token_ids: &[u32]);
         fn step(self: Pin<&mut TextInferenceEngine>) -> Result<u32>;
@@ -32,6 +32,7 @@ unsafe impl Sync for ffi::TextInferenceEngine {}
 pub struct LlamaEngineOptions {
     model_path: String,
     tokenizer_path: String,
+    use_gpu: bool,
 }
 
 pub struct LlamaEngine {
@@ -42,7 +43,7 @@ pub struct LlamaEngine {
 
 impl LlamaEngine {
     pub fn create(options: LlamaEngineOptions) -> Self {
-        let engine = create_engine(&options.model_path);
+        let engine = create_engine(options.use_gpu, &options.model_path);
         if engine.is_null() {
             panic!("Unable to load model: {}", options.model_path);
         }

diff --git a/crates/tabby/src/serve/engine.rs b/crates/tabby/src/serve/engine.rs
index 9eb86f91d767..c6c66288e817 100644
--- a/crates/tabby/src/serve/engine.rs
+++ b/crates/tabby/src/serve/engine.rs
@@ -54,10 +54,10 @@ fn create_local_engine(
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    if args.device != super::Device::Metal {
+    if args.device != super::Device::Metal && args.device != super::Device::Cpu {
         create_ctranslate2_engine(args, model_dir, metadata)
     } else {
-        create_llama_engine(model_dir)
+        create_llama_engine(&args.device, model_dir)
     }
 }
 
@@ -78,11 +78,11 @@ fn create_ctranslate2_engine(
     Box::new(CTranslate2Engine::create(options))
 }
 
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn create_llama_engine(model_dir: &ModelDir) -> Box<dyn TextGeneration> {
+fn create_llama_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
     let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
         .model_path(model_dir.ggml_q8_0_file())
         .tokenizer_path(model_dir.tokenizer_file())
+        .use_gpu(*device == super::Device::Metal)
         .build()
         .unwrap();

diff --git a/crates/tabby/src/serve/mod.rs b/crates/tabby/src/serve/mod.rs
index 70faad8b2ef1..f7789cf522ca 100644
--- a/crates/tabby/src/serve/mod.rs
+++ b/crates/tabby/src/serve/mod.rs
@@ -122,7 +122,7 @@ fn should_download_ggml_files(_device: &Device) -> bool {
 
 #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
 fn should_download_ggml_files(device: &Device) -> bool {
-    *device == Device::Metal
+    *device == Device::Metal || *device == Device::Cpu
 }
 
 pub async fn main(_config: &Config, args: &ServeArgs) {
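This first patch threads a `use_gpu` flag from the Rust wrapper through the cxx
bridge into llama.cpp's `llama_model_params.n_gpu_layers`: 0 keeps inference on
the CPU, 1 offloads to the GPU (Metal). A minimal sketch of driving the new API,
assuming the builder setters accept `String` values — the file paths below are
placeholders, not Tabby's real model layout:

    use llama_cpp_bindings::{LlamaEngine, LlamaEngineOptionsBuilder};

    fn main() {
        let options = LlamaEngineOptionsBuilder::default()
            // Placeholder paths; Tabby itself resolves these through ModelDir.
            .model_path("/path/to/model.gguf".to_owned())
            .tokenizer_path("/path/to/tokenizer.json".to_owned())
            // false => n_gpu_layers = 0 (pure CPU); true => 1 (Metal offload).
            .use_gpu(false)
            .build()
            .unwrap();
        let _engine = LlamaEngine::create(options);
    }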
From e53dab2097b9910670c7dc0272596e4fc94c29de Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 14:27:30 -0700
Subject: [PATCH 2/6] feat: switch cpu serving to ggml

---
 crates/tabby/Cargo.toml          |  2 +-
 crates/tabby/src/serve/engine.rs | 19 +++++++++++--------
 crates/tabby/src/serve/mod.rs    | 26 ++++++++++++++------------
 website/docs/models/index.md     |  1 -
 4 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml
index 87da33f7fca7..4a5ee2872d2f 100644
--- a/crates/tabby/Cargo.toml
+++ b/crates/tabby/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.5.0-dev"
 edition = "2021"
 
 [dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings" }
+ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }
 tabby-common = { path = "../tabby-common" }
 tabby-scheduler = { path = "../tabby-scheduler" }
 tabby-download = { path = "../tabby-download" }

diff --git a/crates/tabby/src/serve/engine.rs b/crates/tabby/src/serve/engine.rs
index c6c66288e817..8aa387b55a30 100644
--- a/crates/tabby/src/serve/engine.rs
+++ b/crates/tabby/src/serve/engine.rs
@@ -1,6 +1,5 @@
 use std::path::Path;
 
-use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
 use serde::Deserialize;
 use tabby_common::path::ModelDir;
 use tabby_inference::TextGeneration;
@@ -39,33 +38,36 @@ pub struct EngineInfo {
     pub chat_template: Option<String>,
 }
 
-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+#[cfg(not(any(feature = "link_shared", feature = "link_cuda_static")))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
-    metadata: &Metadata,
+    _metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    create_ctranslate2_engine(args, model_dir, metadata)
+    create_llama_engine(&args.device, model_dir)
 }
 
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    if args.device != super::Device::Metal && args.device != super::Device::Cpu {
-        create_ctranslate2_engine(args, model_dir, metadata)
-    } else {
+    if args.device.use_ggml_backend() {
         create_llama_engine(&args.device, model_dir)
+    } else {
+        create_ctranslate2_engine(args, model_dir, metadata)
     }
 }
 
+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_ctranslate2_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
+    use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
+
     let device = format!("{}", args.device);
     let options = CTranslate2EngineOptionsBuilder::default()
         .model_path(model_dir.ctranslate2_dir())
@@ -99,6 +101,7 @@ fn get_model_dir(model: &str) -> ModelDir {
 
 #[derive(Deserialize)]
 struct Metadata {
+    #[allow(dead_code)]
     auto_model: String,
     prompt_template: Option<String>,
     chat_template: Option<String>,

diff --git a/crates/tabby/src/serve/mod.rs b/crates/tabby/src/serve/mod.rs
index f7789cf522ca..4ed61d2ccfbb 100644
--- a/crates/tabby/src/serve/mod.rs
+++ b/crates/tabby/src/serve/mod.rs
@@ -74,7 +74,7 @@ pub enum Device {
     #[strum(serialize = "cpu")]
     Cpu,
 
-    #[strum(serialize = "cuda")]
+    #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
     Cuda,
 
     #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
@@ -85,6 +85,18 @@ pub enum Device {
     ExperimentalHttp,
 }
 
+impl Device {
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Metal || *self == Device::Cpu
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Cpu
+    }
+}
+
 #[derive(Args)]
 pub struct ServeArgs {
     /// Model id for `/completions` API endpoint.
@@ -115,16 +127,6 @@ pub struct ServeArgs {
     compute_type: Option<String>,
 }
 
-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-fn should_download_ggml_files(_device: &Device) -> bool {
-    false
-}
-
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn should_download_ggml_files(device: &Device) -> bool {
-    *device == Device::Metal || *device == Device::Cpu
-}
-
 pub async fn main(_config: &Config, args: &ServeArgs) {
     valid_args(args);
 
@@ -273,7 +275,7 @@ fn start_heartbeat(args: &ServeArgs) {
 async fn download_model(model: &str, device: &Device) {
     let downloader = Downloader::new(model, /* prefer_local_file= */ true);
     let handler = |err| fatal!("Failed to fetch model '{}' due to '{}'", model, err,);
-    let download_result = if should_download_ggml_files(device) {
+    let download_result = if device.use_ggml_backend() {
         downloader.download_ggml_files().await
     } else {
         downloader.download_ctranslate2_files().await

diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 9463efa653f7..f0530d3880b5 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -17,7 +17,6 @@ We recommend using
 | [TabbyML/StarCoder-7B](https://huggingface.co/TabbyML/StarCoder-7B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-3B](https://huggingface.co/TabbyML/StarCoder-3B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-1B](https://huggingface.co/TabbyML/StarCoder-1B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
-| [TabbyML/J-350M](https://huggingface.co/TabbyML/J-350M) | [BSD-3](https://opensource.org/license/bsd-3-clause/) | ❌ | ❌ |
 
 ## Chat models (`--chat-model`)
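The second patch moves backend selection off hard-coded `target_os` checks and
onto two axes: a `Device::use_ggml_backend()` helper and the
`link_shared`/`link_cuda_static` Cargo features that gate the CTranslate2 code
path. A behavior sketch of the resulting dispatch — my reading of the diff, not
code taken from it:

    // On macOS/aarch64 with a ctranslate2 feature enabled:
    //   Device::Cpu   -> ggml (llama.cpp)
    //   Device::Metal -> ggml (llama.cpp)
    //   Device::Cuda  -> CTranslate2
    // Without link_shared/link_cuda_static, only the ggml variant of
    // create_local_engine is compiled, so the remaining devices all
    // route to llama.cpp.
    fn backend_name(device: &Device) -> &'static str {
        if device.use_ggml_backend() {
            "llama.cpp"
        } else {
            "ctranslate2"
        }
    }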
From a63669e2c5326c84e2195963a7b3e973fed40ebd Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 14:31:07 -0700
Subject: [PATCH 3/6] fix cargo.toml

---
 crates/tabby/Cargo.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml
index 4a5ee2872d2f..2e4748df5a03 100644
--- a/crates/tabby/Cargo.toml
+++ b/crates/tabby/Cargo.toml
@@ -4,7 +4,6 @@ version = "0.5.0-dev"
 edition = "2021"
 
 [dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }
 tabby-common = { path = "../tabby-common" }
 tabby-scheduler = { path = "../tabby-scheduler" }
 tabby-download = { path = "../tabby-download" }
@@ -43,10 +42,11 @@ minijinja = { version = "1.0.8", features = ["loader"] }
 textdistance = "1.0.2"
 regex.workspace = true
 thiserror.workspace = true
-
-[target.'cfg(all(target_os="macos", target_arch="aarch64"))'.dependencies]
 llama-cpp-bindings = { path = "../llama-cpp-bindings" }
 
+[target.'cfg(any(feature="link_shared", feature="link_cuda_static"))'.dependencies]
+ctranslate2-bindings = { path = "../ctranslate2-bindings" }
+
 [dependencies.uuid]
 version = "1.3.3"
 features = [

From f770a2021ede9053870359143bcd40769636eda9 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 14:38:03 -0700
Subject: [PATCH 4/6] use optional dependency

---
 crates/tabby/Cargo.toml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml
index 2e4748df5a03..37bc80967e69 100644
--- a/crates/tabby/Cargo.toml
+++ b/crates/tabby/Cargo.toml
@@ -43,9 +43,7 @@ textdistance = "1.0.2"
 regex.workspace = true
 thiserror.workspace = true
 llama-cpp-bindings = { path = "../llama-cpp-bindings" }
-
-[target.'cfg(any(feature="link_shared", feature="link_cuda_static"))'.dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings" }
+ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }
 
 [dependencies.uuid]
 version = "1.3.3"
@@ -57,6 +55,7 @@ features = [
 
 [features]
 link_shared = ["ctranslate2-bindings/link_shared"]
+link_cuda_static = ["ctranslate2-bindings"]
 
 [build-dependencies]
 vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
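Patches 3 and 4 are two passes at the same goal. The
`[target.'cfg(any(feature=...))'.dependencies]` table from PATCH 3/6 cannot work
as intended, because Cargo does not evaluate `feature = "..."` predicates inside
`[target.'cfg(...)']` sections; PATCH 4/6 therefore switches to the supported
idiom, an `optional = true` dependency that the `link_shared` and
`link_cuda_static` features turn on. On the Rust side the crate is then only
visible behind the matching `cfg`, roughly like this (a sketch, not taken from
the patch):

    // Compiled only when built with --features link_shared or
    // --features link_cuda_static; otherwise ctranslate2-bindings is never
    // built and this item disappears from the crate entirely.
    #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
    use ctranslate2_bindings::CTranslate2Engine;

A plain `cargo build --no-default-features` consequently produces a
llama.cpp-only binary.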
= "1.0.8", features = ["loader"] } textdistance = "1.0.2" regex.workspace = true thiserror.workspace = true - -[target.'cfg(all(target_os="macos", target_arch="aarch64"))'.dependencies] llama-cpp-bindings = { path = "../llama-cpp-bindings" } +[target.'cfg(any(feature="link_shared", feature="link_cuda_static"))'.dependencies] +ctranslate2-bindings = { path = "../ctranslate2-bindings" } + [dependencies.uuid] version = "1.3.3" features = [ From f770a2021ede9053870359143bcd40769636eda9 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Wed, 25 Oct 2023 14:38:03 -0700 Subject: [PATCH 4/6] use optional dependency --- crates/tabby/Cargo.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml index 2e4748df5a03..37bc80967e69 100644 --- a/crates/tabby/Cargo.toml +++ b/crates/tabby/Cargo.toml @@ -43,9 +43,7 @@ textdistance = "1.0.2" regex.workspace = true thiserror.workspace = true llama-cpp-bindings = { path = "../llama-cpp-bindings" } - -[target.'cfg(any(feature="link_shared", feature="link_cuda_static"))'.dependencies] -ctranslate2-bindings = { path = "../ctranslate2-bindings" } +ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true } [dependencies.uuid] version = "1.3.3" @@ -57,6 +55,7 @@ features = [ [features] link_shared = ["ctranslate2-bindings/link_shared"] +link_cuda_static = ["ctranslate2-bindings"] [build-dependencies] vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] } From 6795467201e3cf1eacea1cf75db4886a0779d545 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Wed, 25 Oct 2023 14:49:47 -0700 Subject: [PATCH 5/6] fix compliation --- CHANGELOG.md | 2 ++ crates/tabby/src/serve/engine.rs | 8 ++++---- crates/tabby/src/serve/mod.rs | 10 ++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc32b6a58dfb..744449f2dafb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,8 @@ ## Features ## Fixes and Improvements +* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638 +* add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637 # v0.4.0 diff --git a/crates/tabby/src/serve/engine.rs b/crates/tabby/src/serve/engine.rs index 8aa387b55a30..8675bf32b097 100644 --- a/crates/tabby/src/serve/engine.rs +++ b/crates/tabby/src/serve/engine.rs @@ -44,7 +44,7 @@ fn create_local_engine( model_dir: &ModelDir, _metadata: &Metadata, ) -> Box { - create_llama_engine(&args.device, model_dir) + create_ggml_engine(&args.device, model_dir) } #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))] @@ -54,7 +54,7 @@ fn create_local_engine( metadata: &Metadata, ) -> Box { if args.device.use_ggml_backend() { - create_llama_engine(&args.device, model_dir) + create_ggml_engine(&args.device, model_dir) } else { create_ctranslate2_engine(args, model_dir, metadata) } @@ -80,11 +80,11 @@ fn create_ctranslate2_engine( Box::new(CTranslate2Engine::create(options)) } -fn create_llama_engine(device: &super::Device, model_dir: &ModelDir) -> Box { +fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box { let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default() .model_path(model_dir.ggml_q8_0_file()) .tokenizer_path(model_dir.tokenizer_file()) - .use_gpu(*device == super::Device::Metal) + .use_gpu(device.ggml_use_gpu()) .build() .unwrap(); diff --git a/crates/tabby/src/serve/mod.rs b/crates/tabby/src/serve/mod.rs index 
From a22b0aa97f67cdb61a5b2c4d20417da91d725c6f Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 15:08:36 -0700
Subject: [PATCH 6/6] update ci target

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c37cf5c6390f..5b1cef0ff83f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -111,7 +111,7 @@ jobs:
       - run: bash ./ci/prepare_build_environment.sh
 
       - name: Build release binary
-        run: cargo build --no-default-features --release --target ${{ matrix.target }}
+        run: cargo build --no-default-features --release --target ${{ matrix.target }} --package tabby
 
       - name: Rename release binary
         run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }}
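The final patch adds `--package tabby` so that `--no-default-features` applies
unambiguously to the tabby crate rather than to whichever workspace members the
bare invocation would select; the CI artifact should be reproducible locally
with `cargo build --no-default-features --release --package tabby`, which
presumably links only the llama.cpp backend.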