Merge branch 'master' into refactor/pipeline-builder-match
polarathene authored May 29, 2024
2 parents 3312ccc + 527e7f5 commit 8549316
Showing 30 changed files with 638 additions and 131 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -9,7 +9,7 @@ members = [
resolver = "2"

[workspace.package]
version = "0.1.10"
version = "0.1.11"
edition = "2021"
description = "Fast and easy LLM serving."
homepage = "https://github.com/EricLBuehler/mistral.rs"
44 changes: 29 additions & 15 deletions README.md
@@ -155,7 +155,7 @@ Please submit more benchmarks via raising an issue!

## Usage
### Installation and Build
To install mistral.rs, one should ensure they have Rust installed by following [this](https://rustup.rs/) link. Additionally, the Hugging Face token should be provided in `~/.cache/huggingface/token` when using the server to enable automatic download of gated models.
To install mistral.rs, one should ensure they have Rust installed by following [this](https://rustup.rs/) link. Additionally, the Hugging Face token should be provided in `~/.cache/huggingface/token` by running `huggingface-cli login` to enable automatic download of gated models.

1) Install required packages
- `openssl` (ex., `sudo apt install libssl-dev`)
@@ -169,9 +169,7 @@ To install mistral.rs, one should ensure they have Rust installed by following [

3) Set HF token correctly (skip if already set or your model is not gated, or if you want to use the `token_source` parameters in Python or the command line.)
```bash
mkdir ~/.cache/huggingface
touch ~/.cache/huggingface/token
echo <HF_TOKEN_HERE> > ~/.cache/huggingface/token
huggingface-cli login
```

4) Download the code
@@ -220,7 +218,13 @@ To install mistral.rs, one should ensure they have Rust installed by following [

You can install Python support by following the guide [here](mistralrs-pyo3/README.md).

### Getting models from HF Hub
## Getting models

There are 2 ways to run a model with mistral.rs:
- From Hugging Face Hub (easiest)
- From local files

### Getting models from Hugging Face Hub

Mistral.rs can automatically download models from HF Hub. To access gated models, you should provide a token source. They may be one of:
- `literal:<value>`: Load from a specified literal
@@ -240,17 +244,12 @@ This is passed in the following ways:

If token cannot be loaded, no token will be used (i.e. effectively using `none`).
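
As a mental model, the token-source strings above map onto a small enum on the Rust side. The sketch below is a simplified, hypothetical stand-in rather than mistral.rs's own type, and it only models the `literal:<value>` and `none` forms visible in this section (the full list is abridged in this diff view):

```rust
// Hypothetical, simplified token-source parsing -- not mistral.rs's own type.
// Only the `literal:<value>` and `none` forms shown above are modeled.
#[derive(Debug, PartialEq)]
enum TokenSource {
    Literal(String),
    None,
}

fn parse_token_source(s: &str) -> Option<TokenSource> {
    if let Some(value) = s.strip_prefix("literal:") {
        Some(TokenSource::Literal(value.to_string()))
    } else if s == "none" {
        Some(TokenSource::None)
    } else {
        None // unknown source string
    }
}

fn main() {
    assert_eq!(
        parse_token_source("literal:hf_xxx"),
        Some(TokenSource::Literal("hf_xxx".to_string()))
    );
    assert_eq!(parse_token_source("none"), Some(TokenSource::None));
    println!("token source parsing sketch: ok");
}
```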

## Loading models from local files:**
### Loading models from local files:

You can also instruct mistral.rs to load models locally by modifying the `*_model_id` arguments or options:
You can also instruct mistral.rs to load models fully locally by modifying the `*_model_id` arguments or options:
```bash
./mistralrs_server --port 1234 plain -m . -a mistral
```
or

```bash
./mistralrs-server gguf -m . -t . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
```

Throughout mistral.rs, any model ID argument or option may be a local path and should contain the following files for each model ID option:
- `--model-id` (server) or `model_id` (python/rust) or `--tok-model-id` (server) or `tok_model_id` (python/rust):
@@ -267,7 +266 @@ Throughout mistral.rs, any model ID argument or option may be a local path and s
- `--adapters-model-id` (server) or `adapters_model_id` (python/rust):
- Adapters `.safetensors` and `adapter_config.json` files in their respective directories

### Run
### Running GGUF models locally

To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument; instead, pass a path to a
chat template JSON file (examples [here](chat_templates); you will need to create your own by specifying the chat template and the `bos`/`eos` tokens) and specify a local model ID. For example:

```bash
./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
```
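
On the Rust side, this CLI invocation corresponds to the GGUF loader builder that this commit refactors in `mistralrs-core` (see `model_loader.rs` further down in this diff). The sketch below mirrors that call shape under stated assumptions; the import path, argument types, and concrete values are inferred from the diff rather than a verified public API:

```rust
// Sketch only: mirrors the GGUFLoaderBuilder::new(...) call shape shown in
// model_loader.rs below, after the tokenizer_json parameter was removed.
// The import path, argument types, and file names are assumptions.
use mistralrs_core::{GGUFLoaderBuilder, GGUFSpecificConfig};

fn main() {
    let loader = GGUFLoaderBuilder::new(
        GGUFSpecificConfig { repeat_last_n: 64 },
        // Chat template JSON path; with a GGUF-sourced tokenizer this avoids remote access.
        Some("chat_templates/phi3.json".to_string()),
        // tok_model_id is now Option<String>; None means fully local.
        None,
        // Local directory holding the quantized file, matching `-m .` above.
        ".".to_string(),
        "Phi-3-mini-128k-instruct-q4_K_M.gguf".to_string(),
    )
    .build();
    let _ = loader; // a real program would hand this loader to the engine
}
```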

The following tokenizer model types are currently supported. If you would like one to be added, please raise an issue. Otherwise,
please consider using the method demonstrated in examples below, where the tokenizer is sourced from Hugging Face.

**Supported GGUF tokenizer types**
- `llama`

## Run

To start a server serving Mistral GGUF on `localhost:1234`,
```bash
@@ -290,7 +304,7 @@ Additionally, for models without quantization, the model architecture should be
You can launch interactive mode, a simple chat application running in the terminal, by passing `-i`:

```bash
./mistralrs_server -i gguf -t mistralai/Mistral-7B-Instruct-v0.1 -m TheBloke/Mistral-7B-Instruct-v0.1-GGUF -f mistral-7b-instruct-v0.1.Q4_K_M.gguf
./mistralrs_server -i plain -m microsoft/Phi-3-mini-128k-instruct -a phi3
```

### Quick examples:
@@ -333,7 +347,7 @@ To start a server running Llama from GGML:
To start a server running Mistral from safetensors.

```bash
./mistralrs_server --port 1234 gguf -m mistralai/Mistral-7B-Instruct-v0.1
./mistralrs_server --port 1234 plain -m mistralai/Mistral-7B-Instruct-v0.1 -a mistral
```

### Structured selection with a `.toml` file
3 changes: 3 additions & 0 deletions chat_templates/llama2.json
@@ -0,0 +1,3 @@
{
"chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
}
3 changes: 3 additions & 0 deletions chat_templates/llama3.json
@@ -0,0 +1,3 @@
{
"chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
}
3 changes: 3 additions & 0 deletions chat_templates/mistral.json
@@ -0,0 +1,3 @@
{
"chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
}
3 changes: 3 additions & 0 deletions chat_templates/phi3.json
@@ -0,0 +1,3 @@
{
"chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
}
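
These chat templates are plain Jinja strings, and mistral.rs renders them with the `minijinja` crate (see `chat_template.rs` later in this diff). The following stand-alone sketch renders the phi3 template above; the file path, the `bos_token` value, and the example messages are assumptions for illustration:

```rust
// Minimal sketch of applying a chat template like the ones above with minijinja.
// Requires the `minijinja`, `serde`, `serde_json`, and `anyhow` crates.
use minijinja::{context, Environment};
use serde::Deserialize;

#[derive(Deserialize)]
struct TemplateFile {
    chat_template: String,
}

fn main() -> anyhow::Result<()> {
    // Load the phi3 template added in this commit (path assumed relative to the repo root).
    let raw = std::fs::read_to_string("chat_templates/phi3.json")?;
    let file: TemplateFile = serde_json::from_str(&raw)?;

    let mut env = Environment::new();
    env.add_template("chat", &file.chat_template)?;

    let prompt = env.get_template("chat")?.render(context! {
        bos_token => "<s>", // assumed; normally taken from the tokenizer config
        messages => vec![
            context! { role => "user", content => "Hello!" },
            context! { role => "assistant", content => "Hi there." },
            context! { role => "user", content => "Write a haiku about Rust." },
        ],
    })?;

    println!("{prompt}");
    Ok(())
}
```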
4 changes: 4 additions & 0 deletions examples/README.md
@@ -0,0 +1,4 @@
# Examples
- Python: [examples here](python)
- HTTP Server: [examples here](server)
- Rust: [examples here](../mistralrs/examples/)
2 changes: 1 addition & 1 deletion mistralrs-bench/Cargo.toml
@@ -17,7 +17,7 @@ candle-core.workspace = true
serde.workspace = true
serde_json.workspace = true
clap.workspace = true
mistralrs-core = { version = "0.1.10", path = "../mistralrs-core" }
mistralrs-core = { version = "0.1.11", path = "../mistralrs-core" }
tracing.workspace = true
either.workspace = true
tokio.workspace = true
1 change: 1 addition & 0 deletions mistralrs-core/Cargo.toml
@@ -59,6 +59,7 @@ akin = "0.4.0"
variantly = "0.4.0"
buildstructor = "0.5.4"
tracing-subscriber.workspace = true
reqwest = { version = "0.12.4", features = ["blocking"] }

[features]
pyo3_macros = ["pyo3"]
8 changes: 1 addition & 7 deletions mistralrs-core/src/model_loader.rs
@@ -150,22 +150,19 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
.build(arch),
ModelSelected::GGUF {
tok_model_id,
tokenizer_json,
quantized_model_id,
quantized_filename,
repeat_last_n,
} => GGUFLoaderBuilder::new(
GGUFSpecificConfig { repeat_last_n },
args.chat_template,
tokenizer_json,
Some(tok_model_id),
tok_model_id,
quantized_model_id,
quantized_filename,
)
.build(),
ModelSelected::XLoraGGUF {
tok_model_id,
tokenizer_json,
quantized_model_id,
quantized_filename,
repeat_last_n,
@@ -175,7 +172,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
} => GGUFLoaderBuilder::new(
GGUFSpecificConfig { repeat_last_n },
args.chat_template,
tokenizer_json,
tok_model_id,
quantized_model_id,
quantized_filename,
@@ -192,7 +188,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
.build(),
ModelSelected::LoraGGUF {
tok_model_id,
tokenizer_json,
quantized_model_id,
quantized_filename,
repeat_last_n,
@@ -201,7 +196,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
} => GGUFLoaderBuilder::new(
GGUFSpecificConfig { repeat_last_n },
args.chat_template,
tokenizer_json,
tok_model_id,
quantized_model_id,
quantized_filename,
26 changes: 10 additions & 16 deletions mistralrs-core/src/model_selected.rs
@@ -95,13 +95,11 @@ pub enum ModelSelected {

/// Select a GGUF model.
GGUF {
/// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
/// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
/// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
/// removing all remote accesses.
#[arg(short, long)]
tok_model_id: String,

/// Path to local tokenizer.json file. If this is specified it is used over any remote file.
#[arg(long)]
tokenizer_json: Option<String>,
tok_model_id: Option<String>,

/// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
/// This may be a HF hub repo or a local path.
@@ -119,14 +117,12 @@ pub enum ModelSelected {

/// Select a GGUF model with X-LoRA.
XLoraGGUF {
/// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
/// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
/// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
/// removing all remote accesses.
#[arg(short, long)]
tok_model_id: Option<String>,

/// Path to local tokenizer.json file. If this is specified it is used over any remote file.
#[arg(long)]
tokenizer_json: Option<String>,

/// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
/// This may be a HF hub repo or a local path.
#[arg(short = 'm', long)]
@@ -156,14 +152,12 @@ pub enum ModelSelected {

/// Select a GGUF model with LoRA.
LoraGGUF {
/// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
/// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
/// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
/// removing all remote accesses.
#[arg(short, long)]
tok_model_id: Option<String>,

/// Path to local tokenizer.json file. If this is specified it is used over any remote file.
#[arg(long)]
tokenizer_json: Option<String>,

/// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
/// This may be a HF hub repo or a local path.
#[arg(short = 'm', long)]
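
Making `tok_model_id` an `Option<String>` is what lets `-t`/`--tok-model-id` be omitted for fully local GGUF runs. The sketch below reproduces just that clap pattern as a reduced, hypothetical CLI, not `mistralrs-server` itself:

```rust
// Reduced reproduction of the argument pattern above: tok_model_id is
// optional, while the quantized model ID and filename stay required.
use clap::Parser;

#[derive(Parser, Debug)]
struct GgufArgs {
    /// Local or remote model ID containing `tokenizer_config.json`;
    /// may be omitted when a chat template file is supplied instead.
    #[arg(short, long)]
    tok_model_id: Option<String>,

    /// Quantized model ID (HF repo or local path) holding the GGUF file.
    #[arg(short = 'm', long)]
    quantized_model_id: String,

    /// Name of the GGUF file inside `quantized_model_id`.
    #[arg(short = 'f', long)]
    quantized_filename: String,
}

fn main() {
    // e.g. `gguf-args -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf`
    let args = GgufArgs::parse();
    println!("{args:?}");
}
```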
29 changes: 14 additions & 15 deletions mistralrs-core/src/pipeline/chat_template.rs
@@ -30,9 +30,9 @@ fn raise_exception(msg: String) -> Result<String, minijinja::Error> {
}

#[derive(Debug, Deserialize)]
pub struct Unk(#[serde(with = "either::serde_untagged")] pub Either<String, AddedTokensDecoder>);
#[derive(Debug, Deserialize)]
pub struct Bos(#[serde(with = "either::serde_untagged")] pub Either<String, AddedTokensDecoder>);
pub struct BeginEndUnkTok(
#[serde(with = "either::serde_untagged")] pub Either<String, AddedTokensDecoder>,
);

#[allow(dead_code)]
#[derive(Debug, Deserialize)]
@@ -41,23 +41,22 @@ pub struct ChatTemplate {
add_eos_token: Option<bool>,
added_tokens_decoder: Option<HashMap<String, AddedTokensDecoder>>,
additional_special_tokens: Option<Vec<String>>,
pub bos_token: Option<Bos>,
pub bos_token: Option<BeginEndUnkTok>,

/// Jinja format chat templating for chat completion.
/// See: https://huggingface.co/docs/transformers/chat_templating
pub chat_template: Option<String>,
clean_up_tokenization_spaces: Option<bool>,
device_map: Option<String>,
#[serde(with = "either::serde_untagged")]
pub eos_token: Either<String, AddedTokensDecoder>,
pub eos_token: Option<BeginEndUnkTok>,
legacy: Option<bool>,
model_max_length: f64,
model_max_length: Option<f64>,
pad_token: Option<String>,
sp_model_kwargs: Option<HashMap<String, String>>,
spaces_between_special_tokens: Option<bool>,
tokenizer_class: String,
tokenizer_class: Option<String>,
truncation_size: Option<String>,
pub unk_token: Option<Unk>,
pub unk_token: Option<BeginEndUnkTok>,
use_default_system_prompt: Option<bool>,
}

@@ -66,10 +65,10 @@ impl ChatTemplate {
self.chat_template.is_some()
}

pub fn eos_tok(&self) -> String {
match self.eos_token {
Either::Left(ref lit) => lit.clone(),
Either::Right(ref added) => added.content.clone(),
pub fn eos_tok(&self) -> Option<String> {
match self.eos_token.as_ref()?.0 {
Either::Left(ref lit) => Some(lit.clone()),
Either::Right(ref added) => Some(added.content.clone()),
}
}

@@ -93,7 +92,7 @@ pub fn calculate_eos_tokens(
gen_conf: Option<GenerationConfig>,
tokenizer: &Tokenizer,
) -> Vec<u32> {
let mut eos_tok_ids = vec![chat_template.eos_tok()];
let mut eos_tok_ids = chat_template.eos_tok().map(|x| vec![x]).unwrap_or_default();
let mut bos_tok_ids = chat_template.bos_tok().map(|b| vec![b]).unwrap_or_default();

for alternate in SUPPORTED_ALTERNATE_EOS {
@@ -173,7 +172,7 @@ pub fn apply_chat_template_to(
add_generation_prompt: bool,
template: &str,
bos_tok: Option<String>,
eos_tok: &str,
eos_tok: Option<String>,
unk_tok: Option<String>,
) -> Result<String> {
let mut env = Environment::new();
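
The `BeginEndUnkTok` wrapper above exists because `bos`/`eos`/`unk` tokens appear in tokenizer configs either as a bare string or as an added-token object. A minimal sketch of the same `either::serde_untagged` pattern, using a simplified stand-in for `AddedTokensDecoder`:

```rust
// Sketch of the deserialization pattern used by BeginEndUnkTok: with
// `either::serde_untagged`, one field accepts either a bare string ("</s>")
// or a structured object ({"content": "</s>", ...}). AddedToken here is a
// simplified stand-in for the crate's AddedTokensDecoder.
// Requires the `either` (with its serde feature), `serde`, and `serde_json` crates.
use either::Either;
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct AddedToken {
    content: String,
}

#[derive(Debug, Deserialize)]
struct Tok(#[serde(with = "either::serde_untagged")] Either<String, AddedToken>);

fn content(tok: &Tok) -> &str {
    match &tok.0 {
        Either::Left(s) => s,
        Either::Right(added) => &added.content,
    }
}

fn main() -> serde_json::Result<()> {
    let plain: Tok = serde_json::from_str(r#""</s>""#)?;
    let structured: Tok = serde_json::from_str(r#"{"content": "</s>"}"#)?;
    assert_eq!(content(&plain), content(&structured));
    println!("both forms deserialize to: {}", content(&plain));
    Ok(())
}
```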