From 3b029c52c58a63c5899dee3f14edb950aa69f60d Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 14:04:58 -0700
Subject: [PATCH 1/6] feat: switch cpu backend to llama.cpp

---
 crates/llama-cpp-bindings/include/engine.h | 2 +-
 crates/llama-cpp-bindings/src/engine.cc    | 4 ++--
 crates/llama-cpp-bindings/src/lib.rs       | 5 +++--
 crates/tabby/src/serve/engine.rs           | 8 ++++----
 crates/tabby/src/serve/mod.rs              | 2 +-
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/crates/llama-cpp-bindings/include/engine.h b/crates/llama-cpp-bindings/include/engine.h
index 834a1d753f81..fffc0a25586f 100644
--- a/crates/llama-cpp-bindings/include/engine.h
+++ b/crates/llama-cpp-bindings/include/engine.h
@@ -16,5 +16,5 @@ class TextInferenceEngine {
   virtual uint32_t eos_token() const = 0;
 };
 
-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path);
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path);
 }  // namespace

diff --git a/crates/llama-cpp-bindings/src/engine.cc b/crates/llama-cpp-bindings/src/engine.cc
index abcaee0e28db..7f3f2986cd2a 100644
--- a/crates/llama-cpp-bindings/src/engine.cc
+++ b/crates/llama-cpp-bindings/src/engine.cc
@@ -114,11 +114,11 @@ struct BackendInitializer {
 };
 }  // namespace
 
-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path) {
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path) {
   static BackendInitializer initializer;
 
   llama_model_params model_params = llama_model_default_params();
-  model_params.n_gpu_layers = 1;
+  model_params.n_gpu_layers = use_gpu ? 1 : 0;
 
   llama_model* model = llama_load_model_from_file(std::string(model_path).c_str(), model_params);
   if (!model) {

diff --git a/crates/llama-cpp-bindings/src/lib.rs b/crates/llama-cpp-bindings/src/lib.rs
index 084280b62c95..53870fc5abd3 100644
--- a/crates/llama-cpp-bindings/src/lib.rs
+++ b/crates/llama-cpp-bindings/src/lib.rs
@@ -15,7 +15,7 @@ mod ffi {
 
         type TextInferenceEngine;
 
-        fn create_engine(model_path: &str) -> UniquePtr<TextInferenceEngine>;
+        fn create_engine(use_gpu: bool, model_path: &str) -> UniquePtr<TextInferenceEngine>;
 
         fn start(self: Pin<&mut TextInferenceEngine>, input_token_ids: &[u32]);
         fn step(self: Pin<&mut TextInferenceEngine>) -> Result<u32>;
@@ -32,6 +32,7 @@ unsafe impl Sync for ffi::TextInferenceEngine {}
 pub struct LlamaEngineOptions {
     model_path: String,
     tokenizer_path: String,
+    use_gpu: bool,
 }
 
 pub struct LlamaEngine {
@@ -42,7 +43,7 @@ pub struct LlamaEngine {
 
 impl LlamaEngine {
     pub fn create(options: LlamaEngineOptions) -> Self {
-        let engine = create_engine(&options.model_path);
+        let engine = create_engine(options.use_gpu, &options.model_path);
         if engine.is_null() {
             panic!("Unable to load model: {}", options.model_path);
         }

diff --git a/crates/tabby/src/serve/engine.rs b/crates/tabby/src/serve/engine.rs
index 9eb86f91d767..c6c66288e817 100644
--- a/crates/tabby/src/serve/engine.rs
+++ b/crates/tabby/src/serve/engine.rs
@@ -54,10 +54,10 @@ fn create_local_engine(
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    if args.device != super::Device::Metal {
+    if args.device != super::Device::Metal && args.device != super::Device::Cpu {
         create_ctranslate2_engine(args, model_dir, metadata)
     } else {
-        create_llama_engine(model_dir)
+        create_llama_engine(&args.device, model_dir)
     }
 }
 
@@ -78,11 +78,11 @@ fn create_ctranslate2_engine(
     Box::new(CTranslate2Engine::create(options))
 }
 
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn create_llama_engine(model_dir: &ModelDir) -> Box<dyn TextGeneration> {
+fn create_llama_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
     let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
         .model_path(model_dir.ggml_q8_0_file())
         .tokenizer_path(model_dir.tokenizer_file())
+        .use_gpu(*device == super::Device::Metal)
         .build()
         .unwrap();

diff --git a/crates/tabby/src/serve/mod.rs b/crates/tabby/src/serve/mod.rs
index 70faad8b2ef1..f7789cf522ca 100644
--- a/crates/tabby/src/serve/mod.rs
+++ b/crates/tabby/src/serve/mod.rs
@@ -122,7 +122,7 @@ fn should_download_ggml_files(_device: &Device) -> bool {
 
 #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
 fn should_download_ggml_files(device: &Device) -> bool {
-    *device == Device::Metal
+    *device == Device::Metal || *device == Device::Cpu
 }
 
 pub async fn main(_config: &Config, args: &ServeArgs) {
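This first patch threads a `use_gpu` flag from the Rust wrapper through the cxx
bridge into llama.cpp's `llama_model_params.n_gpu_layers`: 0 keeps inference on
the CPU, 1 offloads to the GPU (Metal). A minimal sketch of driving the new API,
assuming the builder setters accept `String` values — the file paths below are
placeholders, not Tabby's real model layout:

    use llama_cpp_bindings::{LlamaEngine, LlamaEngineOptionsBuilder};

    fn main() {
        let options = LlamaEngineOptionsBuilder::default()
            // Placeholder paths; Tabby itself resolves these through ModelDir.
            .model_path("/path/to/model.gguf".to_owned())
            .tokenizer_path("/path/to/tokenizer.json".to_owned())
            // false => n_gpu_layers = 0 (pure CPU); true => 1 (Metal offload).
            .use_gpu(false)
            .build()
            .unwrap();
        let _engine = LlamaEngine::create(options);
    }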
From e53dab2097b9910670c7dc0272596e4fc94c29de Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 14:27:30 -0700
Subject: [PATCH 2/6] feat: switch cpu serving to ggml

---
 crates/tabby/Cargo.toml          |  2 +-
 crates/tabby/src/serve/engine.rs | 19 +++++++++++--------
 crates/tabby/src/serve/mod.rs    | 26 ++++++++++++++------------
 website/docs/models/index.md     |  1 -
 4 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml
index 87da33f7fca7..4a5ee2872d2f 100644
--- a/crates/tabby/Cargo.toml
+++ b/crates/tabby/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.5.0-dev"
 edition = "2021"
 
 [dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings" }
+ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }
 tabby-common = { path = "../tabby-common" }
 tabby-scheduler = { path = "../tabby-scheduler" }
 tabby-download = { path = "../tabby-download" }

diff --git a/crates/tabby/src/serve/engine.rs b/crates/tabby/src/serve/engine.rs
index c6c66288e817..8aa387b55a30 100644
--- a/crates/tabby/src/serve/engine.rs
+++ b/crates/tabby/src/serve/engine.rs
@@ -1,6 +1,5 @@
 use std::path::Path;
 
-use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
 use serde::Deserialize;
 use tabby_common::path::ModelDir;
 use tabby_inference::TextGeneration;
@@ -39,33 +38,36 @@ pub struct EngineInfo {
     pub chat_template: Option<String>,
 }
 
-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+#[cfg(not(any(feature = "link_shared", feature = "link_cuda_static")))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
-    metadata: &Metadata,
+    _metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    create_ctranslate2_engine(args, model_dir, metadata)
+    create_llama_engine(&args.device, model_dir)
 }
 
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    if args.device != super::Device::Metal && args.device != super::Device::Cpu {
-        create_ctranslate2_engine(args, model_dir, metadata)
-    } else {
+    if args.device.use_ggml_backend() {
         create_llama_engine(&args.device, model_dir)
+    } else {
+        create_ctranslate2_engine(args, model_dir, metadata)
     }
 }
 
+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_ctranslate2_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
+    use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
+
     let device = format!("{}", args.device);
     let options = CTranslate2EngineOptionsBuilder::default()
         .model_path(model_dir.ctranslate2_dir())
@@ -99,6 +101,7 @@ fn get_model_dir(model: &str) -> ModelDir {
 
 #[derive(Deserialize)]
 struct Metadata {
+    #[allow(dead_code)]
     auto_model: String,
     prompt_template: Option<String>,
     chat_template: Option<String>,

diff --git a/crates/tabby/src/serve/mod.rs b/crates/tabby/src/serve/mod.rs
index f7789cf522ca..4ed61d2ccfbb 100644
--- a/crates/tabby/src/serve/mod.rs
+++ b/crates/tabby/src/serve/mod.rs
@@ -74,7 +74,7 @@ pub enum Device {
     #[strum(serialize = "cpu")]
     Cpu,
 
-    #[strum(serialize = "cuda")]
+    #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
     Cuda,
 
     #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
@@ -85,6 +85,18 @@ pub enum Device {
     ExperimentalHttp,
 }
 
+impl Device {
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Metal || *self == Device::Cpu
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Cpu
+    }
+}
+
 #[derive(Args)]
 pub struct ServeArgs {
     /// Model id for `/completions` API endpoint.
@@ -115,16 +127,6 @@ pub struct ServeArgs {
     compute_type: Option<String>,
 }
 
-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-fn should_download_ggml_files(_device: &Device) -> bool {
-    false
-}
-
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn should_download_ggml_files(device: &Device) -> bool {
-    *device == Device::Metal || *device == Device::Cpu
-}
-
 pub async fn main(_config: &Config, args: &ServeArgs) {
     valid_args(args);
 
@@ -273,7 +275,7 @@ fn start_heartbeat(args: &ServeArgs) {
 async fn download_model(model: &str, device: &Device) {
     let downloader = Downloader::new(model, /* prefer_local_file= */ true);
     let handler = |err| fatal!("Failed to fetch model '{}' due to '{}'", model, err,);
-    let download_result = if should_download_ggml_files(device) {
+    let download_result = if device.use_ggml_backend() {
         downloader.download_ggml_files().await
     } else {
         downloader.download_ctranslate2_files().await

diff --git a/website/docs/models/index.md b/website/docs/models/index.md
index 9463efa653f7..f0530d3880b5 100644
--- a/website/docs/models/index.md
+++ b/website/docs/models/index.md
@@ -17,7 +17,6 @@ We recommend using
 | [TabbyML/StarCoder-7B](https://huggingface.co/TabbyML/StarCoder-7B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-3B](https://huggingface.co/TabbyML/StarCoder-3B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-1B](https://huggingface.co/TabbyML/StarCoder-1B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
-| [TabbyML/J-350M](https://huggingface.co/TabbyML/J-350M) | [BSD-3](https://opensource.org/license/bsd-3-clause/) | ❌ | ❌ |
 
 ## Chat models (`--chat-model`)
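The second patch moves backend selection off hard-coded `target_os` checks and
onto two axes: a `Device::use_ggml_backend()` helper and the
`link_shared`/`link_cuda_static` Cargo features that gate the CTranslate2 code
path. A behavior sketch of the resulting dispatch — my reading of the diff, not
code taken from it:

    // On macOS/aarch64 with a ctranslate2 feature enabled:
    //   Device::Cpu   -> ggml (llama.cpp)
    //   Device::Metal -> ggml (llama.cpp)
    //   Device::Cuda  -> CTranslate2
    // Without link_shared/link_cuda_static, only the ggml variant of
    // create_local_engine is compiled, so the remaining devices all
    // route to llama.cpp.
    fn backend_name(device: &Device) -> &'static str {
        if device.use_ggml_backend() {
            "llama.cpp"
        } else {
            "ctranslate2"
        }
    }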
From a63669e2c5326c84e2195963a7b3e973fed40ebd Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 14:31:07 -0700
Subject: [PATCH 3/6] fix cargo.toml

---
 crates/tabby/Cargo.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml
index 4a5ee2872d2f..2e4748df5a03 100644
--- a/crates/tabby/Cargo.toml
+++ b/crates/tabby/Cargo.toml
@@ -4,7 +4,6 @@ version = "0.5.0-dev"
 edition = "2021"
 
 [dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }
 tabby-common = { path = "../tabby-common" }
 tabby-scheduler = { path = "../tabby-scheduler" }
 tabby-download = { path = "../tabby-download" }
@@ -43,10 +42,11 @@ minijinja = { version = "1.0.8", features = ["loader"] }
 textdistance = "1.0.2"
 regex.workspace = true
 thiserror.workspace = true
-
-[target.'cfg(all(target_os="macos", target_arch="aarch64"))'.dependencies]
 llama-cpp-bindings = { path = "../llama-cpp-bindings" }
 
+[target.'cfg(any(feature="link_shared", feature="link_cuda_static"))'.dependencies]
+ctranslate2-bindings = { path = "../ctranslate2-bindings" }
+
 [dependencies.uuid]
 version = "1.3.3"
 features = [

From f770a2021ede9053870359143bcd40769636eda9 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 14:38:03 -0700
Subject: [PATCH 4/6] use optional dependency

---
 crates/tabby/Cargo.toml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml
index 2e4748df5a03..37bc80967e69 100644
--- a/crates/tabby/Cargo.toml
+++ b/crates/tabby/Cargo.toml
@@ -43,9 +43,7 @@ textdistance = "1.0.2"
 regex.workspace = true
 thiserror.workspace = true
 llama-cpp-bindings = { path = "../llama-cpp-bindings" }
-
-[target.'cfg(any(feature="link_shared", feature="link_cuda_static"))'.dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings" }
+ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }
 
 [dependencies.uuid]
 version = "1.3.3"
@@ -57,6 +55,7 @@ features = [
 
 [features]
 link_shared = ["ctranslate2-bindings/link_shared"]
+link_cuda_static = ["ctranslate2-bindings"]
 
 [build-dependencies]
 vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
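Patches 3 and 4 are two passes at the same goal. The
`[target.'cfg(any(feature=...))'.dependencies]` table from PATCH 3/6 cannot work
as intended, because Cargo does not evaluate `feature = "..."` predicates inside
`[target.'cfg(...)']` sections; PATCH 4/6 therefore switches to the supported
idiom, an `optional = true` dependency that the `link_shared` and
`link_cuda_static` features turn on. On the Rust side the crate is then only
visible behind the matching `cfg`, roughly like this (a sketch, not taken from
the patch):

    // Compiled only when built with --features link_shared or
    // --features link_cuda_static; otherwise ctranslate2-bindings is never
    // built and this item disappears from the crate entirely.
    #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
    use ctranslate2_bindings::CTranslate2Engine;

A plain `cargo build --no-default-features` consequently produces a
llama.cpp-only binary.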
= "1.0.8", features = ["loader"] } textdistance = "1.0.2" regex.workspace = true thiserror.workspace = true - -[target.'cfg(all(target_os="macos", target_arch="aarch64"))'.dependencies] llama-cpp-bindings = { path = "../llama-cpp-bindings" } +[target.'cfg(any(feature="link_shared", feature="link_cuda_static"))'.dependencies] +ctranslate2-bindings = { path = "../ctranslate2-bindings" } + [dependencies.uuid] version = "1.3.3" features = [ From f770a2021ede9053870359143bcd40769636eda9 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Wed, 25 Oct 2023 14:38:03 -0700 Subject: [PATCH 4/6] use optional dependency --- crates/tabby/Cargo.toml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml index 2e4748df5a03..37bc80967e69 100644 --- a/crates/tabby/Cargo.toml +++ b/crates/tabby/Cargo.toml @@ -43,9 +43,7 @@ textdistance = "1.0.2" regex.workspace = true thiserror.workspace = true llama-cpp-bindings = { path = "../llama-cpp-bindings" } - -[target.'cfg(any(feature="link_shared", feature="link_cuda_static"))'.dependencies] -ctranslate2-bindings = { path = "../ctranslate2-bindings" } +ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true } [dependencies.uuid] version = "1.3.3" @@ -57,6 +55,7 @@ features = [ [features] link_shared = ["ctranslate2-bindings/link_shared"] +link_cuda_static = ["ctranslate2-bindings"] [build-dependencies] vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] } From 6795467201e3cf1eacea1cf75db4886a0779d545 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Wed, 25 Oct 2023 14:49:47 -0700 Subject: [PATCH 5/6] fix compliation --- CHANGELOG.md | 2 ++ crates/tabby/src/serve/engine.rs | 8 ++++---- crates/tabby/src/serve/mod.rs | 10 ++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc32b6a58dfb..744449f2dafb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,8 @@ ## Features ## Fixes and Improvements +* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638 +* add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637 # v0.4.0 diff --git a/crates/tabby/src/serve/engine.rs b/crates/tabby/src/serve/engine.rs index 8aa387b55a30..8675bf32b097 100644 --- a/crates/tabby/src/serve/engine.rs +++ b/crates/tabby/src/serve/engine.rs @@ -44,7 +44,7 @@ fn create_local_engine( model_dir: &ModelDir, _metadata: &Metadata, ) -> Box { - create_llama_engine(&args.device, model_dir) + create_ggml_engine(&args.device, model_dir) } #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))] @@ -54,7 +54,7 @@ fn create_local_engine( metadata: &Metadata, ) -> Box { if args.device.use_ggml_backend() { - create_llama_engine(&args.device, model_dir) + create_ggml_engine(&args.device, model_dir) } else { create_ctranslate2_engine(args, model_dir, metadata) } @@ -80,11 +80,11 @@ fn create_ctranslate2_engine( Box::new(CTranslate2Engine::create(options)) } -fn create_llama_engine(device: &super::Device, model_dir: &ModelDir) -> Box { +fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box { let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default() .model_path(model_dir.ggml_q8_0_file()) .tokenizer_path(model_dir.tokenizer_file()) - .use_gpu(*device == super::Device::Metal) + .use_gpu(device.ggml_use_gpu()) .build() .unwrap(); diff --git a/crates/tabby/src/serve/mod.rs b/crates/tabby/src/serve/mod.rs index 
From a22b0aa97f67cdb61a5b2c4d20417da91d725c6f Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Wed, 25 Oct 2023 15:08:36 -0700
Subject: [PATCH 6/6] update ci target

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c37cf5c6390f..5b1cef0ff83f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -111,7 +111,7 @@ jobs:
       - run: bash ./ci/prepare_build_environment.sh
 
       - name: Build release binary
-        run: cargo build --no-default-features --release --target ${{ matrix.target }}
+        run: cargo build --no-default-features --release --target ${{ matrix.target }} --package tabby
 
       - name: Rename release binary
         run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }}
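The final patch adds `--package tabby` so that `--no-default-features` applies
unambiguously to the tabby crate rather than to whichever workspace members the
bare invocation would select; the CI artifact should be reproducible locally
with `cargo build --no-default-features --release --package tabby`, which
presumably links only the llama.cpp backend.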