From 1c2f5d7a1e67d0f5e1f0287bedadb8b2dbc175b7 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Sat, 25 May 2024 10:56:23 -0400
Subject: [PATCH 01/23] Add automatic conversion from gguf to hf tokenizer

---
 mistralrs-core/src/pipeline/gguf.rs           |  6 +-
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 92 +++++++++++++++++++
 mistralrs-core/src/pipeline/mod.rs            |  1 +
 3 files changed, 96 insertions(+), 3 deletions(-)
 create mode 100644 mistralrs-core/src/pipeline/gguf_tokenizer.rs

diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs
index eb3a7ac09c..ae40ec16ae 100644
--- a/mistralrs-core/src/pipeline/gguf.rs
+++ b/mistralrs-core/src/pipeline/gguf.rs
@@ -7,11 +7,11 @@ use crate::aici::bintokens::build_tok_trie;
 use crate::aici::toktree::TokTrie;
 use crate::lora::Ordering;
 use crate::pipeline::chat_template::calculate_eos_tokens;
+use crate::pipeline::gguf_tokenizer::convert_ggml_to_hf_tokenizer;
 use crate::pipeline::Cache;
 use crate::pipeline::{ChatTemplate, LocalModelPaths};
 use crate::prefix_cacher::PrefixCacheManager;
 use crate::sequence::Sequence;
-use crate::utils::tokenizer::get_tokenizer;
 use crate::utils::varbuilder_utils::{from_mmaped_safetensors, load_preload_adapters};
 use crate::xlora_models::NonGranularState;
 use crate::{deserialize_chat_template, do_sample, get_mut_arcmutex, get_paths, DeviceMapMetadata};
@@ -329,6 +329,8 @@ impl Loader for GGUFLoader {
             }
         }
 
+        let tokenizer = convert_ggml_to_hf_tokenizer(&model)?;
+
         let mut is_lora = false;
         let model = match self.kind {
             ModelKind::QuantizedGGUF => match arch {
@@ -449,8 +451,6 @@ impl Loader for GGUFLoader {
             _ => unreachable!(),
         };
 
-        let tokenizer = get_tokenizer(paths.get_tokenizer_filename())?;
-
         let (chat_template, gen_conf) = deserialize_chat_template!(paths, self);
 
         let max_seq_len = match model {
diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
new file mode 100644
index 0000000000..246c7bbd94
--- /dev/null
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -0,0 +1,92 @@
+use std::collections::HashMap;
+
+use anyhow::Result;
+use candle_core::quantized::gguf_file::Content;
+use tokenizers::{models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer};
+
+pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
+    let model = content.metadata["tokenizer.ggml.model"]
+        .to_string()
+        .expect("GGUF tokenizer model is not a string.")
+        .clone();
+    let tokens = content.metadata["tokenizer.ggml.tokens"]
+        .to_vec()
+        .expect("GGUF tokenizer tokens is not a vec.")
+        .iter()
+        .map(|t| t.to_string().expect("GGUF token is not a string.").clone())
+        .collect::<Vec<_>>();
+    let added_tokens = content
+        .metadata
+        .get("tokenizer.ggml.added_tokens")
+        .map(|items| {
+            items
+                .to_vec()
+                .expect("GGUF tokenizer added_tokens is not a vec.")
+                .iter()
+                .map(|t| {
+                    t.to_string()
+                        .expect("GGUF added_token is not a string.")
+                        .clone()
+                })
+                .collect::<Vec<_>>()
+        });
+    let merges = content.metadata.get("tokenizer.ggml.merges").map(|items| {
+        items
+            .to_vec()
+            .expect("GGUF tokenizer merges is not a vec.")
+            .iter()
+            .map(|t| t.to_string().expect("GGUF merges is not a string.").clone())
+            .collect::<Vec<_>>()
+    });
+
+    let _bos = content.metadata["tokenizer.ggml.bos_token_id"]
+        .to_u32()
+        .expect("GGUF bos token is not u32");
+    let _eos = content.metadata["tokenizer.ggml.eos_token_id"]
+        .to_u32()
+        .expect("GGUF eos token is not u32");
+    let unk = content.metadata["tokenizer.ggml.unknown_token_id"]
+        .to_u32()
+        .expect("GGUF unk token is not u32");
+    let _sep = content.metadata["tokenizer.ggml.separator_token_id"]
+        .to_u32()
+        .expect("GGUF sep token is not u32");
+    let _pad = content.metadata["tokenizer.ggml.padding_token_id"]
+        .to_u32()
+        .expect("GGUF pad token is not u32");
+
+    let tokenizer = match model.as_str() {
+        "llama" | "replit" | "gpt2" | "rwkv" => {
+            // BPE, as seen in relevant tokenizer.json files
+            let bpe_builder = BpeBuilder::new().unk_token(tokens[unk as usize].clone());
+
+            let mut vocab = HashMap::new();
+            for (i, tok) in tokens.into_iter().enumerate() {
+                #[allow(clippy::cast_possible_truncation)]
+                vocab.insert(tok, i as u32);
+            }
+            let mut merges_vec = Vec::new();
+            if let Some(merges) = merges {
+                for tok in merges {
+                    let split = tok.splitn(2, ' ').collect::<Vec<_>>();
+                    merges_vec.push((split[0].to_string(), split[1].to_string()));
+                }
+            }
+            let bpe = bpe_builder
+                .vocab_and_merges(vocab, merges_vec)
+                .build()
+                .map_err(anyhow::Error::msg)?;
+            let mut tokenizer = Tokenizer::new(ModelWrapper::BPE(bpe));
+            if let Some(added_tokens) = added_tokens {
+                for added_token in added_tokens {
+                    tokenizer.add_special_tokens(&[AddedToken::from(added_token, true)]);
+                }
+            }
+            tokenizer
+        }
+        other => {
+            anyhow::bail!("Tokenizer model `{other}` not supported.");
+        }
+    };
+    Ok(tokenizer)
+}
diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs
index c2c5512ff3..6b61dd0ea2 100644
--- a/mistralrs-core/src/pipeline/mod.rs
+++ b/mistralrs-core/src/pipeline/mod.rs
@@ -2,6 +2,7 @@ mod cache_manager;
 mod chat_template;
 mod ggml;
 mod gguf;
+mod gguf_tokenizer;
 mod loaders;
 mod macros;
 mod normal;

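For orientation, here is a minimal sketch of how the new `convert_ggml_to_hf_tokenizer` entry point can be exercised on its own (from code living alongside the pipeline module, since `gguf_tokenizer` is private at this point). The file path and the standalone helper are assumptions; reading the metadata via candle's `gguf_file::Content::read` mirrors what the GGUF loader already does.

    use std::fs::File;

    use anyhow::Result;
    use candle_core::quantized::gguf_file::Content;

    use super::gguf_tokenizer::convert_ggml_to_hf_tokenizer;

    // Hypothetical driver: read GGUF metadata from disk and rebuild an HF `tokenizers::Tokenizer` from it.
    fn convert_from_path(path: &str) -> Result<()> {
        let mut file = File::open(path)?;
        let content = Content::read(&mut file)?;
        let tokenizer = convert_ggml_to_hf_tokenizer(&content)?;
        println!("vocab size: {}", tokenizer.get_vocab_size(true));
        Ok(())
    }
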
From b3ac5c80e3d98e5572a9e28544984733365ab4fa Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Sat, 25 May 2024 11:00:24 -0400
Subject: [PATCH 02/23] Add info messages

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 246c7bbd94..27c2cdf6a6 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -3,6 +3,7 @@ use std::collections::HashMap;
 use anyhow::Result;
 use candle_core::quantized::gguf_file::Content;
 use tokenizers::{models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer};
+use tracing::info;
 
 pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
     let model = content.metadata["tokenizer.ggml.model"]
@@ -39,6 +40,12 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
             .collect::<Vec<_>>()
     });
 
+    info!(
+        "Converting GGML tokenizer. Model: `{model}`, num tokens: {}, num added tokens: {}, num merges: {}",
+        tokens.len(),
+        added_tokens.as_ref().map(|x| x.len()).unwrap_or(0),
+        merges.as_ref().map(|x| x.len()).unwrap_or(0)
+    );
     let _bos = content.metadata["tokenizer.ggml.bos_token_id"]
         .to_u32()
         .expect("GGUF bos token is not u32");
@@ -59,6 +66,7 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
         "llama" | "replit" | "gpt2" | "rwkv" => {
             // BPE, as seen in relevant tokenizer.json files
             let bpe_builder = BpeBuilder::new().unk_token(tokens[unk as usize].clone());
+            info!("Loading as BPE tokenizer.");
 
             let mut vocab = HashMap::new();
             for (i, tok) in tokens.into_iter().enumerate() {

From 36c46cc602933b96c8627ae2bab9d9d862a9ba0d Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Sat, 25 May 2024 11:12:01 -0400
Subject: [PATCH 03/23] Add decoder to tokenizer

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 27c2cdf6a6..66fdf4349f 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -2,7 +2,9 @@ use std::collections::HashMap;
 
 use anyhow::Result;
 use candle_core::quantized::gguf_file::Content;
-use tokenizers::{models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer};
+use tokenizers::{
+    decoders::bpe::BPEDecoder, models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer,
+};
 use tracing::info;
 
 pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
@@ -46,21 +48,9 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
         added_tokens.as_ref().map(|x| x.len()).unwrap_or(0),
         merges.as_ref().map(|x| x.len()).unwrap_or(0)
     );
-    let _bos = content.metadata["tokenizer.ggml.bos_token_id"]
-        .to_u32()
-        .expect("GGUF bos token is not u32");
-    let _eos = content.metadata["tokenizer.ggml.eos_token_id"]
-        .to_u32()
-        .expect("GGUF eos token is not u32");
     let unk = content.metadata["tokenizer.ggml.unknown_token_id"]
         .to_u32()
         .expect("GGUF unk token is not u32");
-    let _sep = content.metadata["tokenizer.ggml.separator_token_id"]
-        .to_u32()
-        .expect("GGUF sep token is not u32");
-    let _pad = content.metadata["tokenizer.ggml.padding_token_id"]
-        .to_u32()
-        .expect("GGUF pad token is not u32");
 
     let tokenizer = match model.as_str() {
         "llama" | "replit" | "gpt2" | "rwkv" => {
@@ -85,6 +75,7 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
                 .build()
                 .map_err(anyhow::Error::msg)?;
             let mut tokenizer = Tokenizer::new(ModelWrapper::BPE(bpe));
+            tokenizer.with_decoder(BPEDecoder::default());
             if let Some(added_tokens) = added_tokens {
                 for added_token in added_tokens {
                     tokenizer.add_special_tokens(&[AddedToken::from(added_token, true)]);

From be2fca1be34856c0e64bbb1a5f16922707d206a2 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Sat, 25 May 2024 16:47:37 -0400
Subject: [PATCH 04/23] More progress, it's horrifying

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 58 +++++++++++++++++--
 1 file changed, 54 insertions(+), 4 deletions(-)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 66fdf4349f..01e7004639 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -3,7 +3,11 @@ use std::collections::HashMap;
 use anyhow::Result;
 use candle_core::quantized::gguf_file::Content;
 use tokenizers::{
-    decoders::bpe::BPEDecoder, models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer,
+    decoders::{byte_fallback::ByteFallback, fuse::Fuse, sequence::Sequence, strip::Strip},
+    models::bpe::BpeBuilder,
+    normalizers::{self, Prepend, Replace},
+    processors::template::{self, Template, TemplateProcessing, Tokens},
+    AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
 };
 use tracing::info;
 
@@ -52,6 +56,14 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
         .to_u32()
         .expect("GGUF unk token is not u32");
 
+    let eos = content.metadata["tokenizer.ggml.eos_token_id"]
+        .to_u32()
+        .expect("GGUF unk token is not u32");
+
+    let bos = content.metadata["tokenizer.ggml.bos_token_id"]
+        .to_u32()
+        .expect("GGUF unk token is not u32");
+
     let tokenizer = match model.as_str() {
         "llama" | "replit" | "gpt2" | "rwkv" => {
             // BPE, as seen in relevant tokenizer.json files
@@ -59,9 +71,9 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
             info!("Loading as BPE tokenizer.");
 
             let mut vocab = HashMap::new();
-            for (i, tok) in tokens.into_iter().enumerate() {
+            for (i, tok) in tokens.iter().enumerate() {
                 #[allow(clippy::cast_possible_truncation)]
-                vocab.insert(tok, i as u32);
+                vocab.insert(tok.clone(), i as u32);
             }
             let mut merges_vec = Vec::new();
             if let Some(merges) = merges {
@@ -72,15 +84,53 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
             }
             let bpe = bpe_builder
                 .vocab_and_merges(vocab, merges_vec)
+                .fuse_unk(true)
                 .build()
                 .map_err(anyhow::Error::msg)?;
             let mut tokenizer = Tokenizer::new(ModelWrapper::BPE(bpe));
-            tokenizer.with_decoder(BPEDecoder::default());
+            tokenizer.with_decoder(Sequence::new(vec![
+                DecoderWrapper::Replace(Replace::new("▁", " ").map_err(anyhow::Error::msg)?),
+                DecoderWrapper::ByteFallback(ByteFallback::default()),
+                DecoderWrapper::Fuse(Fuse::new()),
+                DecoderWrapper::Strip(Strip::new(' ', 1, 0)),
+            ]));
             if let Some(added_tokens) = added_tokens {
                 for added_token in added_tokens {
                     tokenizer.add_special_tokens(&[AddedToken::from(added_token, true)]);
                 }
             }
+            tokenizer.add_special_tokens(&[AddedToken::from(tokens[bos as usize].clone(), true)]);
+            tokenizer.add_special_tokens(&[AddedToken::from(tokens[eos as usize].clone(), true)]);
+            tokenizer.add_special_tokens(&[AddedToken::from(tokens[unk as usize].clone(), true)]);
+
+            tokenizer.with_post_processor(
+                TemplateProcessing::builder()
+                    .special_tokens(Tokens::from(vec![template::SpecialToken::new(
+                        tokens[bos as usize].clone(),
+                        vec![bos],
+                        vec![tokens[bos as usize].clone()],
+                    )
+                    .map_err(anyhow::Error::msg)?]))
+                    .pair(
+                        Template::try_from(vec![
+                            tokens[bos as usize].clone(),
+                            "$A".to_string(),
+                            tokens[bos as usize].clone(),
+                            "$B:1".to_string(),
+                        ])
+                        .unwrap(),
+                    )
+                    .single(
+                        Template::try_from(vec![tokens[bos as usize].clone(), "$A".to_string()])
+                            .unwrap(),
+                    )
+                    .build()?,
+            );
+            tokenizer.with_normalizer(normalizers::Sequence::new(vec![
+                NormalizerWrapper::Prepend(Prepend::new("▁".to_string())),
+                NormalizerWrapper::Replace(Replace::new(" ", "▁").map_err(anyhow::Error::msg)?),
+            ]));
+            info!("Decoder is: {:?}", tokenizer.get_decoder());
             tokenizer
         }
         other => {

From ba44cca98cba1ba53e9370ca1941fee5bda9f617 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 05:19:01 -0400
Subject: [PATCH 05/23] Merge

---
 mistralrs-core/src/pipeline/gguf.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs
index 5b1f1ca65d..6e104836c1 100644
--- a/mistralrs-core/src/pipeline/gguf.rs
+++ b/mistralrs-core/src/pipeline/gguf.rs
@@ -482,8 +482,6 @@ impl Loader for GGUFLoader {
             _ => unreachable!(),
         };
 
-        let tokenizer = get_tokenizer(paths.get_tokenizer_filename())?;
-
         let gen_conf: Option<GenerationConfig> = paths
             .get_gen_conf_filename()
             .map(|f| serde_json::from_str(&fs::read_to_string(f).unwrap()).unwrap());

From b276c160189f8e900efd7213acb031b3061f82de Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 07:17:56 -0400
Subject: [PATCH 06/23] Use unigram tokenizer for llama

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 102 ++++--------------
 1 file changed, 23 insertions(+), 79 deletions(-)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 01e7004639..697f0f320b 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -1,14 +1,6 @@
-use std::collections::HashMap;
-
 use anyhow::Result;
 use candle_core::quantized::gguf_file::Content;
-use tokenizers::{
-    decoders::{byte_fallback::ByteFallback, fuse::Fuse, sequence::Sequence, strip::Strip},
-    models::bpe::BpeBuilder,
-    normalizers::{self, Prepend, Replace},
-    processors::template::{self, Template, TemplateProcessing, Tokens},
-    AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
-};
+use tokenizers::{models::unigram::Unigram, ModelWrapper, Tokenizer};
 use tracing::info;
 
 pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
@@ -37,6 +29,14 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
                 })
                 .collect::<Vec<_>>()
         });
+    let scores = content.metadata.get("tokenizer.ggml.scores").map(|items| {
+        items
+            .to_vec()
+            .expect("GGUF tokenizer scores is not a vec.")
+            .iter()
+            .map(|t| t.to_f32().expect("GGUF score is not a f32."))
+            .collect::<Vec<_>>()
+    });
     let merges = content.metadata.get("tokenizer.ggml.merges").map(|items| {
         items
             .to_vec()
@@ -47,91 +47,35 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
     });
 
     info!(
-        "Converting GGML tokenizer. Model: `{model}`, num tokens: {}, num added tokens: {}, num merges: {}",
+        "Converting GGML tokenizer. Model: `{model}`, num tokens: {}, num added tokens: {}, num merges: {}, num scores: {}",
         tokens.len(),
         added_tokens.as_ref().map(|x| x.len()).unwrap_or(0),
-        merges.as_ref().map(|x| x.len()).unwrap_or(0)
+        merges.as_ref().map(|x| x.len()).unwrap_or(0),
+        scores.as_ref().map(|x| x.len()).unwrap_or(0)
     );
     let unk = content.metadata["tokenizer.ggml.unknown_token_id"]
         .to_u32()
         .expect("GGUF unk token is not u32");
 
-    let eos = content.metadata["tokenizer.ggml.eos_token_id"]
+    let _eos = content.metadata["tokenizer.ggml.eos_token_id"]
         .to_u32()
         .expect("GGUF unk token is not u32");
 
-    let bos = content.metadata["tokenizer.ggml.bos_token_id"]
+    let _bos = content.metadata["tokenizer.ggml.bos_token_id"]
         .to_u32()
         .expect("GGUF unk token is not u32");
 
     let tokenizer = match model.as_str() {
-        "llama" | "replit" | "gpt2" | "rwkv" => {
-            // BPE, as seen in relevant tokenizer.json files
-            let bpe_builder = BpeBuilder::new().unk_token(tokens[unk as usize].clone());
-            info!("Loading as BPE tokenizer.");
-
-            let mut vocab = HashMap::new();
-            for (i, tok) in tokens.iter().enumerate() {
-                #[allow(clippy::cast_possible_truncation)]
-                vocab.insert(tok.clone(), i as u32);
-            }
-            let mut merges_vec = Vec::new();
-            if let Some(merges) = merges {
-                for tok in merges {
-                    let split = tok.splitn(2, ' ').collect::<Vec<_>>();
-                    merges_vec.push((split[0].to_string(), split[1].to_string()));
-                }
+        "llama" => {
+            let scores =
+                scores.expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokenizer.");
+            let mut vocab = Vec::new();
+            for (token, score) in tokens.into_iter().zip(scores) {
+                vocab.push((token, score as f64));
             }
-            let bpe = bpe_builder
-                .vocab_and_merges(vocab, merges_vec)
-                .fuse_unk(true)
-                .build()
-                .map_err(anyhow::Error::msg)?;
-            let mut tokenizer = Tokenizer::new(ModelWrapper::BPE(bpe));
-            tokenizer.with_decoder(Sequence::new(vec![
-                DecoderWrapper::Replace(Replace::new("▁", " ").map_err(anyhow::Error::msg)?),
-                DecoderWrapper::ByteFallback(ByteFallback::default()),
-                DecoderWrapper::Fuse(Fuse::new()),
-                DecoderWrapper::Strip(Strip::new(' ', 1, 0)),
-            ]));
-            if let Some(added_tokens) = added_tokens {
-                for added_token in added_tokens {
-                    tokenizer.add_special_tokens(&[AddedToken::from(added_token, true)]);
-                }
-            }
-            tokenizer.add_special_tokens(&[AddedToken::from(tokens[bos as usize].clone(), true)]);
-            tokenizer.add_special_tokens(&[AddedToken::from(tokens[eos as usize].clone(), true)]);
-            tokenizer.add_special_tokens(&[AddedToken::from(tokens[unk as usize].clone(), true)]);
-
-            tokenizer.with_post_processor(
-                TemplateProcessing::builder()
-                    .special_tokens(Tokens::from(vec![template::SpecialToken::new(
-                        tokens[bos as usize].clone(),
-                        vec![bos],
-                        vec![tokens[bos as usize].clone()],
-                    )
-                    .map_err(anyhow::Error::msg)?]))
-                    .pair(
-                        Template::try_from(vec![
-                            tokens[bos as usize].clone(),
-                            "$A".to_string(),
-                            tokens[bos as usize].clone(),
-                            "$B:1".to_string(),
-                        ])
-                        .unwrap(),
-                    )
-                    .single(
-                        Template::try_from(vec![tokens[bos as usize].clone(), "$A".to_string()])
-                            .unwrap(),
-                    )
-                    .build()?,
-            );
-            tokenizer.with_normalizer(normalizers::Sequence::new(vec![
-                NormalizerWrapper::Prepend(Prepend::new("▁".to_string())),
-                NormalizerWrapper::Replace(Replace::new(" ", "▁").map_err(anyhow::Error::msg)?),
-            ]));
-            info!("Decoder is: {:?}", tokenizer.get_decoder());
-            tokenizer
+            let unigram =
+                Unigram::from(vocab, Some(unk as usize), true).map_err(anyhow::Error::msg)?;
+            Tokenizer::new(ModelWrapper::Unigram(unigram))
         }
         other => {
             anyhow::bail!("Tokenizer model `{other}` not supported.");

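Since the unigram path relies on `tokenizer.ggml.scores` being present, it can help to confirm up front which tokenizer keys a given GGUF actually carries before attempting conversion. A small illustrative probe follows; the helper name is made up, and it assumes candle's `Content::metadata` is the usual `HashMap<String, Value>`.

    use candle_core::quantized::gguf_file::Content;

    // Hypothetical helper: print every tokenizer-related metadata key, e.g. to confirm that
    // `tokenizer.ggml.model` is "llama" and that `tokenizer.ggml.scores` is present.
    fn dump_tokenizer_keys(content: &Content) {
        let mut keys: Vec<&String> = content
            .metadata
            .keys()
            .filter(|k| k.starts_with("tokenizer.ggml."))
            .collect();
        keys.sort();
        for key in keys {
            println!("{key}");
        }
    }
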
From 1e31df7835279c4a263b932c39aa2d3184fb75cb Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 07:23:01 -0400
Subject: [PATCH 07/23] Logging

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 697f0f320b..783b3fb167 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -81,5 +81,6 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
             anyhow::bail!("Tokenizer model `{other}` not supported.");
         }
     };
+    info!("GGUF tokenizer model is `{model}`: {tokenizer:?}.");
     Ok(tokenizer)
 }

From dd5a855b24dff1783fceba7b3707a603b2421b5f Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 07:39:56 -0400
Subject: [PATCH 08/23] Implement for llama and replit

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 783b3fb167..8e15911f9d 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -1,6 +1,11 @@
 use anyhow::Result;
 use candle_core::quantized::gguf_file::Content;
-use tokenizers::{models::unigram::Unigram, ModelWrapper, Tokenizer};
+use tokenizers::{
+    decoders::{byte_fallback::ByteFallback, sequence::Sequence, strip::Strip},
+    models::unigram::Unigram,
+    normalizers::Replace,
+    DecoderWrapper, ModelWrapper, Tokenizer,
+};
 use tracing::info;
 
 pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
@@ -66,7 +71,8 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
         .expect("GGUF unk token is not u32");
 
     let tokenizer = match model.as_str() {
-        "llama" => {
+        "llama" | "replit" => {
+            // unigram
             let scores =
                 scores.expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokeizer.");
             let mut vocab = Vec::new();
@@ -75,7 +81,13 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
             }
             let unigram =
                 Unigram::from(vocab, Some(unk as usize), true).map_err(anyhow::Error::msg)?;
-            Tokenizer::new(ModelWrapper::Unigram(unigram))
+            let mut tokenizer = Tokenizer::new(ModelWrapper::Unigram(unigram));
+            tokenizer.with_decoder(Sequence::new(vec![
+                DecoderWrapper::Replace(Replace::new("▁", " ").map_err(anyhow::Error::msg)?),
+                DecoderWrapper::ByteFallback(ByteFallback::new()),
+                DecoderWrapper::Strip(Strip::new(' ', 1, 0)),
+            ]));
+            tokenizer
         }
         other => {
             anyhow::bail!("Tokenizer model `{other}` not supported.");

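To eyeball the effect of the Replace/ByteFallback/Strip decoder chain added above, a quick round trip through the converted tokenizer is handy. This is only a sketch: the surrounding function and sample text are hypothetical, but `encode`, `decode`, and `get_ids` are the standard `tokenizers` calls, and errors are mapped the same way the patch does.

    use anyhow::Result;
    use tokenizers::Tokenizer;

    // Hypothetical check: encode a string, then decode the IDs and print both so the
    // behaviour of the decoder sequence can be inspected by hand.
    fn roundtrip(tokenizer: &Tokenizer, text: &str) -> Result<()> {
        let encoding = tokenizer.encode(text, false).map_err(anyhow::Error::msg)?;
        let decoded = tokenizer
            .decode(encoding.get_ids(), false)
            .map_err(anyhow::Error::msg)?;
        println!("ids: {:?}", encoding.get_ids());
        println!("decoded: {decoded}");
        Ok(())
    }
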
From d68522cf25376f1e5c7de60bf17d54b549be14dd Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 07:43:15 -0400
Subject: [PATCH 09/23] Better logging

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 8e15911f9d..42fb9a40b5 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -70,7 +70,7 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
         .to_u32()
         .expect("GGUF unk token is not u32");
 
-    let tokenizer = match model.as_str() {
+    let (tokenizer, ty) = match model.as_str() {
         "llama" | "replit" => {
             // unigram
             let scores =
@@ -87,12 +87,16 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
                 DecoderWrapper::ByteFallback(ByteFallback::new()),
                 DecoderWrapper::Strip(Strip::new(' ', 1, 0)),
             ]));
-            tokenizer
+            (tokenizer, "unigram")
         }
         other => {
             anyhow::bail!("Tokenizer model `{other}` not supported.");
         }
     };
-    info!("GGUF tokenizer model is `{model}`: {tokenizer:?}.");
+    info!(
+        "GGUF tokenizer model is `{model}`, num vocab: {}, kind: `{}`",
+        tokenizer.get_vocab_size(true),
+        ty
+    );
     Ok(tokenizer)
 }

From d366d2aba0fa5ae1d8c96268480ec71db7d657de Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 07:46:47 -0400
Subject: [PATCH 10/23] Nicer logging

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 42fb9a40b5..47eaef3ec8 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -51,13 +51,6 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
             .collect::<Vec<_>>()
     });
 
-    info!(
-        "Converting GGML tokenizer. Model: `{model}`, num tokens: {}, num added tokens: {}, num merges: {}, num scores: {}",
-        tokens.len(),
-        added_tokens.as_ref().map(|x| x.len()).unwrap_or(0),
-        merges.as_ref().map(|x| x.len()).unwrap_or(0),
-        scores.as_ref().map(|x| x.len()).unwrap_or(0)
-    );
     let unk = content.metadata["tokenizer.ggml.unknown_token_id"]
         .to_u32()
         .expect("GGUF unk token is not u32");
@@ -73,11 +66,12 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
     let (tokenizer, ty) = match model.as_str() {
         "llama" | "replit" => {
             // unigram
-            let scores =
-                scores.expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokenizer.");
+            let scores = scores
+                .as_ref()
+                .expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokeizer.");
             let mut vocab = Vec::new();
             for (token, score) in tokens.into_iter().zip(scores) {
-                vocab.push((token, score as f64));
+                vocab.push((token, *score as f64));
             }
             let unigram =
                 Unigram::from(vocab, Some(unk as usize), true).map_err(anyhow::Error::msg)?;
@@ -94,9 +88,12 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
         }
     };
     info!(
-        "GGUF tokenizer model is `{model}`, num vocab: {}, kind: `{}`",
+        "GGUF tokenizer model is `{model}`, kind: `{}`, num tokens: {}, num added tokens: {}, num merges: {}, num scores: {}",
+        ty,
         tokenizer.get_vocab_size(true),
-        ty
+        added_tokens.as_ref().map(|x| x.len()).unwrap_or(0),
+        merges.as_ref().map(|x| x.len()).unwrap_or(0),
+        scores.as_ref().map(|x| x.len()).unwrap_or(0)
     );
     Ok(tokenizer)
 }

From 3d416a7ffb50434d4ac2bb2b85c07315c4d56f6a Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 07:47:30 -0400
Subject: [PATCH 11/23] Update for verbose mode

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 47eaef3ec8..b693bb1544 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -1,3 +1,5 @@
+use std::sync::atomic::Ordering;
+
 use anyhow::Result;
 use candle_core::quantized::gguf_file::Content;
 use tokenizers::{
@@ -8,6 +10,8 @@ use tokenizers::{
 };
 use tracing::info;
 
+use crate::DEBUG;
+
 pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
     let model = content.metadata["tokenizer.ggml.model"]
         .to_string()
@@ -95,5 +99,8 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
         merges.as_ref().map(|x| x.len()).unwrap_or(0),
         scores.as_ref().map(|x| x.len()).unwrap_or(0)
     );
+    if DEBUG.load(Ordering::Relaxed) {
+        info!("Tokenizer: {tokenizer:?}");
+    }
     Ok(tokenizer)
 }

From 19cf0288392aa8e2587997f6a62afc3aaf045ae3 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 08:34:02 -0400
Subject: [PATCH 12/23] Allow fully local loading for gguf

---
 mistralrs-core/src/model_loader.rs    |  6 --
 mistralrs-core/src/model_selected.rs  | 24 +++-----
 mistralrs-core/src/pipeline/gguf.rs   | 18 +++---
 mistralrs-core/src/pipeline/macros.rs | 83 +++++++++++++++++++++++++++
 mistralrs-core/src/pipeline/mod.rs    | 16 +++++-
 mistralrs-core/src/toml_selector.rs   | 15 +++--
 mistralrs-pyo3/API.md                 | 12 +++-
 mistralrs-pyo3/mistralrs.pyi          |  3 -
 mistralrs-pyo3/src/lib.rs             |  6 --
 mistralrs-pyo3/src/which.rs           |  3 -
 mistralrs/examples/quantized/main.rs  |  1 -
 11 files changed, 131 insertions(+), 56 deletions(-)

diff --git a/mistralrs-core/src/model_loader.rs b/mistralrs-core/src/model_loader.rs
index b7438b0f08..3ab381ad97 100644
--- a/mistralrs-core/src/model_loader.rs
+++ b/mistralrs-core/src/model_loader.rs
@@ -150,14 +150,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
         .build(arch),
         ModelSelected::GGUF {
             tok_model_id,
-            tokenizer_json,
             quantized_model_id,
             quantized_filename,
             repeat_last_n,
         } => GGUFLoaderBuilder::new(
             GGUFSpecificConfig { repeat_last_n },
             args.chat_template,
-            tokenizer_json,
             Some(tok_model_id),
             quantized_model_id,
             quantized_filename,
@@ -165,7 +163,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
         .build(),
         ModelSelected::XLoraGGUF {
             tok_model_id,
-            tokenizer_json,
             quantized_model_id,
             quantized_filename,
             repeat_last_n,
@@ -175,7 +172,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
         } => GGUFLoaderBuilder::new(
             GGUFSpecificConfig { repeat_last_n },
             args.chat_template,
-            tokenizer_json,
             tok_model_id,
             quantized_model_id,
             quantized_filename,
@@ -192,7 +188,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
         .build(),
         ModelSelected::LoraGGUF {
             tok_model_id,
-            tokenizer_json,
             quantized_model_id,
             quantized_filename,
             repeat_last_n,
@@ -201,7 +196,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
         } => GGUFLoaderBuilder::new(
             GGUFSpecificConfig { repeat_last_n },
             args.chat_template,
-            tokenizer_json,
             tok_model_id,
             quantized_model_id,
             quantized_filename,
diff --git a/mistralrs-core/src/model_selected.rs b/mistralrs-core/src/model_selected.rs
index 6642c3f8fc..1bf68939d6 100644
--- a/mistralrs-core/src/model_selected.rs
+++ b/mistralrs-core/src/model_selected.rs
@@ -95,14 +95,12 @@ pub enum ModelSelected {
 
     /// Select a GGUF model.
     GGUF {
-        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
+        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
+        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
+        /// removing all remote accesses.
         #[arg(short, long)]
         tok_model_id: String,
 
-        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
-        #[arg(long)]
-        tokenizer_json: Option<String>,
-
         /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
         /// This may be a HF hub repo or a local path.
         #[arg(short = 'm', long)]
@@ -119,14 +117,12 @@ pub enum ModelSelected {
 
     /// Select a GGUF model with X-LoRA.
     XLoraGGUF {
-        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
+        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
+        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
+        /// removing all remote accesses.
         #[arg(short, long)]
         tok_model_id: Option<String>,
 
-        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
-        #[arg(long)]
-        tokenizer_json: Option<String>,
-
         /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
         /// This may be a HF hub repo or a local path.
         #[arg(short = 'm', long)]
@@ -156,14 +152,12 @@ pub enum ModelSelected {
 
     /// Select a GGUF model with LoRA.
     LoraGGUF {
-        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
+        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
+        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
+        /// removing all remote accesses.
         #[arg(short, long)]
         tok_model_id: Option<String>,
 
-        /// Path to local tokenizer.json file. If this is specified it is used over any remote file.
-        #[arg(long)]
-        tokenizer_json: Option<String>,
-
         /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
         /// This may be a HF hub repo or a local path.
         #[arg(short = 'm', long)]
diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs
index 6e104836c1..ae3bb9dcaa 100644
--- a/mistralrs-core/src/pipeline/gguf.rs
+++ b/mistralrs-core/src/pipeline/gguf.rs
@@ -14,7 +14,7 @@ use crate::prefix_cacher::PrefixCacheManager;
 use crate::sequence::Sequence;
 use crate::utils::varbuilder_utils::{from_mmaped_safetensors, load_preload_adapters};
 use crate::xlora_models::NonGranularState;
-use crate::{do_sample, get_mut_arcmutex, get_paths, DeviceMapMetadata, DEBUG};
+use crate::{do_sample, get_mut_arcmutex, get_paths_gguf, DeviceMapMetadata, DEBUG};
 use crate::{
     models::quantized_llama::ModelWeights as QLlama,
     models::quantized_phi2::ModelWeights as QPhi,
@@ -69,7 +69,6 @@ pub struct GGUFLoader {
     xlora_order: Option<Ordering>,
     no_kv_cache: bool,
     chat_template: Option<String>,
-    tokenizer_json: Option<String>,
     kind: ModelKind,
     tgt_non_granular_index: Option<usize>,
 }
@@ -119,24 +118,24 @@ pub struct GGUFLoaderBuilder {
     xlora_order: Option<Ordering>,
     no_kv_cache: bool,
     chat_template: Option<String>,
-    tokenizer_json: Option<String>,
     tgt_non_granular_index: Option<usize>,
 }
 
 impl GGUFLoaderBuilder {
+    /// Create a loader builder for a GGUF model. `tok_model_id` is the model ID where you can find a
+    /// `tokenizer_config.json` file. If the `chat_template` is specified, then it will be treated as a
+    /// path and used over remote files, removing all remote accesses.
     pub fn new(
         config: GGUFSpecificConfig,
         chat_template: Option<String>,
-        tokenizer_json: Option<String>,
-        model_id: Option<String>,
+        tok_model_id: Option<String>,
         quantized_model_id: String,
         quantized_filename: String,
     ) -> Self {
         Self {
             config,
             chat_template,
-            tokenizer_json,
-            model_id,
+            model_id: tok_model_id,
             kind: ModelKind::QuantizedGGUF,
             quantized_filename,
             quantized_model_id,
@@ -197,7 +196,6 @@ impl GGUFLoaderBuilder {
             xlora_order: self.xlora_order,
             no_kv_cache: self.no_kv_cache,
             chat_template: self.chat_template,
-            tokenizer_json: self.tokenizer_json,
             tgt_non_granular_index: self.tgt_non_granular_index,
             quantized_filename: Some(self.quantized_filename),
             quantized_model_id: Some(self.quantized_model_id),
@@ -217,7 +215,6 @@ impl GGUFLoader {
         xlora_order: Option<Ordering>,
         no_kv_cache: bool,
         chat_template: Option<String>,
-        tokenizer_json: Option<String>,
         tgt_non_granular_index: Option<usize>,
     ) -> Self {
         let model_id = if let Some(id) = model_id {
@@ -238,7 +235,6 @@ impl GGUFLoader {
             xlora_order,
             no_kv_cache,
             chat_template,
-            tokenizer_json,
             kind,
             tgt_non_granular_index,
         }
@@ -279,7 +275,7 @@ impl Loader for GGUFLoader {
         mapper: DeviceMapMetadata,
         in_situ_quant: Option<GgmlDType>,
     ) -> Result<Arc<Mutex<dyn Pipeline + Send + Sync>>> {
-        let paths: anyhow::Result<Box<dyn ModelPaths>> = get_paths!(
+        let paths: anyhow::Result<Box<dyn ModelPaths>> = get_paths_gguf!(
             LocalModelPaths,
             &token_source,
             revision,
diff --git a/mistralrs-core/src/pipeline/macros.rs b/mistralrs-core/src/pipeline/macros.rs
index 25068ccad1..7f8f663d59 100644
--- a/mistralrs-core/src/pipeline/macros.rs
+++ b/mistralrs-core/src/pipeline/macros.rs
@@ -138,6 +138,89 @@ macro_rules! get_paths {
     }};
 }
 
+#[macro_export]
+macro_rules! get_paths_gguf {
+    ($path_name:ident, $token_source:expr, $revision:expr, $this:expr, $quantized_model_id:expr, $quantized_filename:expr, $silent:expr) => {{
+        let api = ApiBuilder::new()
+            .with_progress(!$silent)
+            .with_token(get_token($token_source)?)
+            .build()?;
+        let revision = $revision.unwrap_or("main".to_string());
+        let api = api.repo(Repo::with_revision(
+            $this.model_id.clone(),
+            RepoType::Model,
+            revision.clone(),
+        ));
+        let model_id = std::path::Path::new(&$this.model_id);
+
+        let chat_template = if let Some(ref p) = $this.chat_template {
+            if p.ends_with(".json") {
+                info!("Using chat template file at `{p}`");
+                PathBuf::from_str(p)?
+            } else {
+                PathBuf::from_str("")?
+            }
+        } else {
+            $crate::api_get_file!(
+                api,
+                "tokenizer_config.json",
+                model_id
+            ) // Will be loaded from inside gguf file
+        };
+
+        let filenames = get_model_paths(
+            revision.clone(),
+            &$token_source,
+            &$quantized_model_id,
+            &$quantized_filename,
+            &api,
+            &model_id,
+        )?;
+
+        let XLoraPaths {
+            adapter_configs,
+            adapter_safetensors,
+            classifier_path,
+            xlora_order,
+            xlora_config,
+            lora_preload_adapter_info,
+        } = get_xlora_paths(
+            $this.model_id.clone(),
+            &$this.xlora_model_id,
+            &$token_source,
+            revision.clone(),
+            &$this.xlora_order,
+        )?;
+
+        let gen_conf = if $crate::api_dir_list!(api, model_id)
+            .collect::<Vec<_>>()
+            .contains(&"generation_config.json".to_string())
+        {
+            Some($crate::api_get_file!(
+                api,
+                "generation_config.json",
+                model_id
+            ))
+        } else {
+            None
+        };
+
+        Ok(Box::new($path_name {
+            tokenizer_filename: PathBuf::from_str("")?,
+            config_filename: PathBuf::from_str("")?,
+            filenames,
+            xlora_adapter_configs: adapter_configs,
+            xlora_adapter_filenames: adapter_safetensors,
+            classifier_path,
+            classifier_config: xlora_config,
+            xlora_ordering: xlora_order,
+            template_filename: chat_template,
+            gen_conf,
+            lora_preload_adapter_info,
+        }))
+    }};
+}
+
 #[macro_export]
 macro_rules! normal_model_loader {
     ($paths:expr, $dtype:expr, $default_dtype:expr, $device:expr, $config:expr, $loader:expr, $use_flash_attn:expr, $silent:expr, $mapper:expr, $loading_isq:expr, $real_device:expr) => {{
diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs
index 68b94bb089..d06b91af77 100644
--- a/mistralrs-core/src/pipeline/mod.rs
+++ b/mistralrs-core/src/pipeline/mod.rs
@@ -1298,8 +1298,20 @@ pub(crate) fn get_chat_template(
     paths: &Box<dyn ModelPaths>,
     chat_template: &Option<String>,
 ) -> ChatTemplate {
+    let template_filename = if paths.get_template_filename().to_string_lossy().is_empty() {
+        PathBuf::from(
+            chat_template
+                .as_ref()
+                .expect("A tokenizer config or chat template file path must be specified."),
+        )
+    } else {
+        paths.get_template_filename().clone()
+    };
+    if !template_filename.ends_with(".json") {
+        panic!("Template filename {template_filename:?} must end with `.json`.");
+    }
     let template: ChatTemplate =
-        serde_json::from_str(&fs::read_to_string(paths.get_template_filename()).unwrap()).unwrap();
+        serde_json::from_str(&fs::read_to_string(&template_filename).unwrap()).unwrap();
 
     #[derive(Debug, serde::Deserialize)]
     struct SpecifiedTemplate {
@@ -1314,7 +1326,7 @@ pub(crate) fn get_chat_template(
 
     info!("`tokenizer_config.json` does not contain a chat template, attempting to use specified JINJA chat template.");
     let mut deser: HashMap<String, Value> =
-        serde_json::from_str(&fs::read_to_string(paths.get_template_filename()).unwrap()).unwrap();
+        serde_json::from_str(&fs::read_to_string(&template_filename).unwrap()).unwrap();
 
     match chat_template.clone() {
         Some(t) => {
diff --git a/mistralrs-core/src/toml_selector.rs b/mistralrs-core/src/toml_selector.rs
index 5bf67276ca..478d940eb0 100644
--- a/mistralrs-core/src/toml_selector.rs
+++ b/mistralrs-core/src/toml_selector.rs
@@ -65,7 +65,9 @@ enum TomlModelSelected {
     /// Select a GGUF model.
     #[allow(clippy::upper_case_acronyms)]
     GGUF {
-        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
+        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
+        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
+        /// removing all remote accesses.
         tok_model_id: String,
 
         /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
@@ -78,7 +80,9 @@ enum TomlModelSelected {
 
     /// Select a GGUF model with X-LoRA.
     XLoraGGUF {
-        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
+        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
+        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
+        /// removing all remote accesses.
         tok_model_id: Option<String>,
 
         /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
@@ -101,7 +105,9 @@ enum TomlModelSelected {
 
     /// Select a GGUF model with LoRA.
     LoraGGUF {
-        /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path.
+        /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file.
+        /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
+        /// removing all remote accesses.
         tok_model_id: Option<String>,
 
         /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
@@ -299,7 +305,6 @@ fn loader_from_selected(
                 repeat_last_n: args.repeat_last_n,
             },
             args.chat_template,
-            args.tokenizer_json,
             Some(tok_model_id),
             quantized_model_id,
             quantized_filename,
@@ -317,7 +322,6 @@ fn loader_from_selected(
                 repeat_last_n: args.repeat_last_n,
             },
             args.chat_template,
-            args.tokenizer_json,
             tok_model_id,
             quantized_model_id,
             quantized_filename,
@@ -343,7 +347,6 @@ fn loader_from_selected(
                 repeat_last_n: args.repeat_last_n,
             },
             args.chat_template,
-            args.tokenizer_json,
             tok_model_id,
             quantized_model_id,
             quantized_filename,
diff --git a/mistralrs-pyo3/API.md b/mistralrs-pyo3/API.md
index 7d03873482..359ac00e80 100644
--- a/mistralrs-pyo3/API.md
+++ b/mistralrs-pyo3/API.md
@@ -22,11 +22,13 @@ Additionally, for models without quantization, the model architecture should be
 
 ```py
 class Which(Enum):
+    @dataclass
     class Plain:
         model_id: str
         arch: Architecture
         tokenizer_json: str | None = None
         repeat_last_n: int = 64
+    @dataclass
     class XLora:
         arch: Architecture
         xlora_model_id: str
@@ -35,6 +37,7 @@ class Which(Enum):
         model_id: str | None = None
         tokenizer_json: str | None = None
         repeat_last_n: int = 64
+    @dataclass
     class Lora:
         arch: Architecture
         adapters_model_id: str
@@ -42,12 +45,13 @@ class Which(Enum):
         model_id: str | None = None
         tokenizer_json: str | None = None
         repeat_last_n: int = 64
+    @dataclass
     class GGUF:
         tok_model_id: str
         quantized_model_id: str
         quantized_filename: str
-        tokenizer_json: str | None = None
         repeat_last_n: int = 64
+    @dataclass
     class XLoraGGUF:
         tok_model_id: str
         quantized_model_id: str
@@ -55,22 +59,23 @@ class Which(Enum):
         xlora_model_id: str
         order: str
         tgt_non_granular_index: int | None = None
-        tokenizer_json: str | None = None
         repeat_last_n: int = 64
+    @dataclass
     class LoraGGUF:
         tok_model_id: str
         quantized_model_id: str
         quantized_filename: str
         adapters_model_id: str
         order: str
-        tokenizer_json: str | None = None
         repeat_last_n: int = 64
+    @dataclass
     class GGML:
         tok_model_id: str
         quantized_model_id: str
         quantized_filename: str
         tokenizer_json: str | None = None
         repeat_last_n: int = 64
+    @dataclass
     class XLoraGGML:
         tok_model_id: str
         quantized_model_id: str
@@ -80,6 +85,7 @@ class Which(Enum):
         tgt_non_granular_index: int | None = None
         tokenizer_json: str | None = None
         repeat_last_n: int = 64
+    @dataclass
     class LoraGGML:
         tok_model_id: str
         quantized_model_id: str
diff --git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi
index a1239a557d..f1d7c46c7c 100644
--- a/mistralrs-pyo3/mistralrs.pyi
+++ b/mistralrs-pyo3/mistralrs.pyi
@@ -96,7 +96,6 @@ class Which(Enum):
         tok_model_id: str
         quantized_model_id: str
         quantized_filename: str
-        tokenizer_json: str | None = None
         repeat_last_n: int = 64
     @dataclass
     class XLoraGGUF:
@@ -106,7 +105,6 @@ class Which(Enum):
         xlora_model_id: str
         order: str
         tgt_non_granular_index: int | None = None
-        tokenizer_json: str | None = None
         repeat_last_n: int = 64
     @dataclass
     class LoraGGUF:
@@ -115,7 +113,6 @@ class Which(Enum):
         quantized_filename: str
         adapters_model_id: str
         order: str
-        tokenizer_json: str | None = None
         repeat_last_n: int = 64
     @dataclass
     class GGML:
diff --git a/mistralrs-pyo3/src/lib.rs b/mistralrs-pyo3/src/lib.rs
index b1e5f8a832..aa61d5d7fe 100644
--- a/mistralrs-pyo3/src/lib.rs
+++ b/mistralrs-pyo3/src/lib.rs
@@ -167,7 +167,6 @@ fn parse_which(
         .build(arch.into()),
         Which::GGUF {
             tok_model_id,
-            tokenizer_json,
             quantized_model_id,
             quantized_filename,
             repeat_last_n,
@@ -176,7 +175,6 @@ fn parse_which(
                 repeat_last_n: repeat_last_n.unwrap_or(REPEAT_LAST_N_DEFAULT),
             },
             chat_template,
-            tokenizer_json,
             Some(tok_model_id),
             quantized_model_id,
             quantized_filename,
@@ -184,7 +182,6 @@ fn parse_which(
         .build(),
         Which::XLoraGGUF {
             tok_model_id,
-            tokenizer_json,
             quantized_model_id,
             quantized_filename,
             repeat_last_n,
@@ -196,7 +193,6 @@ fn parse_which(
                 repeat_last_n: repeat_last_n.unwrap_or(REPEAT_LAST_N_DEFAULT),
             },
             chat_template,
-            tokenizer_json,
             tok_model_id,
             quantized_model_id,
             quantized_filename,
@@ -214,7 +210,6 @@ fn parse_which(
         .build(),
         Which::LoraGGUF {
             tok_model_id,
-            tokenizer_json,
             quantized_model_id,
             quantized_filename,
             repeat_last_n,
@@ -225,7 +220,6 @@ fn parse_which(
                 repeat_last_n: repeat_last_n.unwrap_or(REPEAT_LAST_N_DEFAULT),
             },
             chat_template,
-            tokenizer_json,
             tok_model_id,
             quantized_model_id,
             quantized_filename,
diff --git a/mistralrs-pyo3/src/which.rs b/mistralrs-pyo3/src/which.rs
index f7def2cfb9..98bce20d87 100644
--- a/mistralrs-pyo3/src/which.rs
+++ b/mistralrs-pyo3/src/which.rs
@@ -57,7 +57,6 @@ pub enum Which {
     #[allow(clippy::upper_case_acronyms)]
     GGUF {
         tok_model_id: String,
-        tokenizer_json: Option<String>,
         quantized_model_id: String,
         quantized_filename: String,
         repeat_last_n: Option<usize>,
@@ -65,7 +64,6 @@ pub enum Which {
 
     XLoraGGUF {
         tok_model_id: Option<String>,
-        tokenizer_json: Option<String>,
         quantized_model_id: String,
         quantized_filename: String,
         repeat_last_n: Option<usize>,
@@ -76,7 +74,6 @@ pub enum Which {
 
     LoraGGUF {
         tok_model_id: Option<String>,
-        tokenizer_json: Option<String>,
         quantized_model_id: String,
         quantized_filename: String,
         repeat_last_n: Option<usize>,
diff --git a/mistralrs/examples/quantized/main.rs b/mistralrs/examples/quantized/main.rs
index 37f60ef01d..58f1ac92b4 100644
--- a/mistralrs/examples/quantized/main.rs
+++ b/mistralrs/examples/quantized/main.rs
@@ -12,7 +12,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> {
     let loader = GGUFLoaderBuilder::new(
         GGUFSpecificConfig { repeat_last_n: 64 },
         None,
-        None,
         Some("mistralai/Mistral-7B-Instruct-v0.1".to_string()),
         "TheBloke/Mistral-7B-Instruct-v0.1-GGUF".to_string(),
         "mistral-7b-instruct-v0.1.Q4_K_M.gguf".to_string(),

From d8831239b62d93597e234a2ef9ab2e9c6ebd5ab0 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 08:38:28 -0400
Subject: [PATCH 13/23] Update docs for loading

---
 README.md                            | 10 ++++++----
 mistralrs-core/src/model_loader.rs   |  2 +-
 mistralrs-core/src/model_selected.rs |  2 +-
 mistralrs-pyo3/src/lib.rs            |  2 +-
 mistralrs-pyo3/src/which.rs          |  2 +-
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 256e5d369c..6bc804e6b0 100644
--- a/README.md
+++ b/README.md
@@ -240,16 +240,18 @@ This is passed in the following ways:
 
 If token cannot be loaded, no token will be used (i.e. effectively using `none`).
 
-## Loading models from local files:**
+## Loading models from local files:
 
-You can also instruct mistral.rs to load models locally by modifying the `*_model_id` arguments or options:
+You can also instruct mistral.rs to load models fully locally by modifying the `*_model_id` arguments or options:
 ```bash
 ./mistralrs_server --port 1234 plain -m . -a mistral
 ```
-or
+
+To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument; instead, pass a path to the chat
+template JSON file (examples [here](chat_templates)) and specify a local model ID. For example:
 
 ```bash
-./mistralrs-server gguf -m . -t . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
+./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
 ```
 
 Throughout mistral.rs, any model ID argument or option may be a local path and should contain the following files for each model ID option:
diff --git a/mistralrs-core/src/model_loader.rs b/mistralrs-core/src/model_loader.rs
index 3ab381ad97..3d61eb62cc 100644
--- a/mistralrs-core/src/model_loader.rs
+++ b/mistralrs-core/src/model_loader.rs
@@ -156,7 +156,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa
         } => GGUFLoaderBuilder::new(
             GGUFSpecificConfig { repeat_last_n },
             args.chat_template,
-            Some(tok_model_id),
+            tok_model_id,
             quantized_model_id,
             quantized_filename,
         )
diff --git a/mistralrs-core/src/model_selected.rs b/mistralrs-core/src/model_selected.rs
index 1bf68939d6..a9ed08e5d0 100644
--- a/mistralrs-core/src/model_selected.rs
+++ b/mistralrs-core/src/model_selected.rs
@@ -99,7 +99,7 @@ pub enum ModelSelected {
         /// If the `chat_template` is specified, then it will be treated as a path and used over remote files,
         /// removing all remote accesses.
         #[arg(short, long)]
-        tok_model_id: String,
+        tok_model_id: Option<String>,
 
         /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set.
         /// This may be a HF hub repo or a local path.
diff --git a/mistralrs-pyo3/src/lib.rs b/mistralrs-pyo3/src/lib.rs
index aa61d5d7fe..ae0ec9d3b6 100644
--- a/mistralrs-pyo3/src/lib.rs
+++ b/mistralrs-pyo3/src/lib.rs
@@ -175,7 +175,7 @@ fn parse_which(
                 repeat_last_n: repeat_last_n.unwrap_or(REPEAT_LAST_N_DEFAULT),
             },
             chat_template,
-            Some(tok_model_id),
+            tok_model_id,
             quantized_model_id,
             quantized_filename,
         )
diff --git a/mistralrs-pyo3/src/which.rs b/mistralrs-pyo3/src/which.rs
index 98bce20d87..a5a33a6123 100644
--- a/mistralrs-pyo3/src/which.rs
+++ b/mistralrs-pyo3/src/which.rs
@@ -56,7 +56,7 @@ pub enum Which {
 
     #[allow(clippy::upper_case_acronyms)]
     GGUF {
-        tok_model_id: String,
+        tok_model_id: Option<String>,
         quantized_model_id: String,
         quantized_filename: String,
         repeat_last_n: Option<usize>,

From bf308d476e13e3ca5f80e9dcf0af38f7c0cca905 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 12:46:30 -0400
Subject: [PATCH 14/23] Fix extension checking

---
 mistralrs-core/src/pipeline/mod.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs
index d06b91af77..9d7dbee835 100644
--- a/mistralrs-core/src/pipeline/mod.rs
+++ b/mistralrs-core/src/pipeline/mod.rs
@@ -1307,7 +1307,12 @@ pub(crate) fn get_chat_template(
     } else {
         paths.get_template_filename().clone()
     };
-    if !template_filename.ends_with(".json") {
+    if template_filename
+        .extension()
+        .expect("Template filename must be a file")
+        .to_string_lossy()
+        != "json"
+    {
         panic!("Template filename {template_filename:?} must end with `.json`.");
     }
     let template: ChatTemplate =
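
A note on the hunk above: on a `PathBuf`, `ends_with` compares whole path components rather than raw string suffixes, so a check like `template_filename.ends_with(".json")` does not actually test the file extension, which is presumably what this fix addresses. A minimal standalone sketch of the difference (the path below is made up):

```rust
use std::path::Path;

fn main() {
    let template = Path::new("chat_templates/mistral.json");

    // `Path::ends_with` matches whole components ("mistral.json"), not raw
    // string suffixes, so this is false even though the name ends in ".json".
    assert!(!template.ends_with(".json"));

    // Comparing the extension itself is the reliable check.
    assert_eq!(template.extension().and_then(|e| e.to_str()), Some("json"));
}
```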

From ec4ccb9ac31e8240e301aaf0321566b80555425f Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 14:07:46 -0400
Subject: [PATCH 15/23] Add some tests

---
 mistralrs-core/Cargo.toml                     |   1 +
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 165 +++++++++++++++++-
 mistralrs-core/src/sampler.rs                 |   6 +-
 3 files changed, 159 insertions(+), 13 deletions(-)

diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml
index 2ba0475874..9d4c9c1999 100644
--- a/mistralrs-core/Cargo.toml
+++ b/mistralrs-core/Cargo.toml
@@ -56,6 +56,7 @@ toml = "0.8.12"
 strum = { version = "0.26", features = ["derive"] }
 derive_more = { version = "0.99.17", default-features = false, features = ["from"] }
 tracing-subscriber.workspace = true
+reqwest = { version = "0.12.4", features = ["blocking"] }
 
 [features]
 pyo3_macros = ["pyo3"]
diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index b693bb1544..3bb97fa9bf 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -3,10 +3,10 @@ use std::sync::atomic::Ordering;
 use anyhow::Result;
 use candle_core::quantized::gguf_file::Content;
 use tokenizers::{
-    decoders::{byte_fallback::ByteFallback, sequence::Sequence, strip::Strip},
+    decoders::{self, byte_fallback::ByteFallback, fuse::Fuse, strip::Strip},
     models::unigram::Unigram,
-    normalizers::Replace,
-    DecoderWrapper, ModelWrapper, Tokenizer,
+    normalizers::{self, Prepend, Replace},
+    AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer,
 };
 use tracing::info;
 
@@ -59,11 +59,11 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
         .to_u32()
         .expect("GGUF unk token is not u32");
 
-    let _eos = content.metadata["tokenizer.ggml.eos_token_id"]
+    let eos = content.metadata["tokenizer.ggml.eos_token_id"]
         .to_u32()
         .expect("GGUF unk token is not u32");
 
-    let _bos = content.metadata["tokenizer.ggml.bos_token_id"]
+    let bos = content.metadata["tokenizer.ggml.bos_token_id"]
         .to_u32()
         .expect("GGUF unk token is not u32");
 
@@ -74,17 +74,27 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
                 .as_ref()
                 .expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokeizer.");
             let mut vocab = Vec::new();
-            for (token, score) in tokens.into_iter().zip(scores) {
-                vocab.push((token, *score as f64));
+            for (token, score) in tokens.iter().zip(scores) {
+                vocab.push((token.clone(), *score as f64));
             }
             let unigram =
                 Unigram::from(vocab, Some(unk as usize), true).map_err(anyhow::Error::msg)?;
             let mut tokenizer = Tokenizer::new(ModelWrapper::Unigram(unigram));
-            tokenizer.with_decoder(Sequence::new(vec![
+            tokenizer.with_decoder(decoders::sequence::Sequence::new(vec![
                 DecoderWrapper::Replace(Replace::new("▁", " ").map_err(anyhow::Error::msg)?),
                 DecoderWrapper::ByteFallback(ByteFallback::new()),
+                DecoderWrapper::Fuse(Fuse::new()),
                 DecoderWrapper::Strip(Strip::new(' ', 1, 0)),
             ]));
+            tokenizer.with_normalizer(normalizers::Sequence::new(vec![
+                NormalizerWrapper::Prepend(Prepend::new("▁".to_string())),
+                NormalizerWrapper::Replace(Replace::new(" ", "▁").map_err(anyhow::Error::msg)?),
+            ]));
+
+            tokenizer.add_special_tokens(&[AddedToken::from(tokens[bos as usize].clone(), true)]);
+            tokenizer.add_special_tokens(&[AddedToken::from(tokens[eos as usize].clone(), true)]);
+            tokenizer.add_special_tokens(&[AddedToken::from(tokens[unk as usize].clone(), true)]);
+
             (tokenizer, "unigram")
         }
         other => {
@@ -104,3 +114,142 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
     }
     Ok(tokenizer)
 }
+
+mod tests {
+    use anyhow::Result;
+    use candle_core::quantized::gguf_file::Content;
+    use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
+    use tokenizers::Tokenizer;
+
+    use super::convert_ggml_to_hf_tokenizer;
+
+    #[allow(dead_code)]
+    #[derive(Debug)]
+    enum TokenizerType {
+        /// Mistral v0.1 tokenizer
+        Llama,
+        Replit,
+        Gpt2,
+        Rwkv,
+    }
+
+    #[allow(dead_code)]
+    fn get_gguf_tokenizer(tokenizer: TokenizerType) -> Result<Tokenizer> {
+        match tokenizer {
+            TokenizerType::Llama => {
+                let api = ApiBuilder::new().with_progress(true).build().unwrap();
+                let api = api.repo(Repo::with_revision(
+                    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF".to_string(),
+                    RepoType::Model,
+                    "main".to_string(),
+                ));
+
+                let filename = api.get("mistral-7b-instruct-v0.1.Q2_K.gguf").unwrap();
+                let mut file = std::fs::File::open(&filename)?;
+                convert_ggml_to_hf_tokenizer(
+                    &Content::read(&mut file)
+                        .map_err(|e| e.with_path(filename))
+                        .map_err(anyhow::Error::msg)?,
+                )
+                .map_err(anyhow::Error::msg)
+            }
+            other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"),
+        }
+    }
+
+    #[allow(dead_code)]
+    fn get_hf_tokenizer(tokenizer: TokenizerType) -> Result<Tokenizer> {
+        match tokenizer {
+            TokenizerType::Llama => {
+                let api = ApiBuilder::new().with_progress(true).build().unwrap();
+                let api = api.repo(Repo::with_revision(
+                    "EricB/mistralrs_tests".to_string(),
+                    RepoType::Model,
+                    "main".to_string(),
+                ));
+
+                let tokenizer_filename = api.get("tokenizer.json").unwrap();
+                Ok(Tokenizer::from_file(tokenizer_filename).unwrap())
+            }
+            other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"),
+        }
+    }
+
+    #[allow(dead_code)]
+    fn get_test_passage() -> String {
+        let passage = reqwest::blocking::get("https://loripsum.net/api")
+            .expect("Failed to download sample text")
+            .bytes()
+            .expect("Failed to get bytes");
+        String::from_utf8(passage.to_vec()).expect("Failed to convert sample text to string.")
+    }
+
+    #[test]
+    fn test_encode_llama() -> Result<()> {
+        let passage = get_test_passage();
+        let hf_tokenizer = get_hf_tokenizer(TokenizerType::Llama)?;
+        let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Llama)?;
+
+        // Without special tokens
+        let hf_tokenized = hf_tokenizer
+            .encode(passage.as_str(), false)
+            .map_err(anyhow::Error::msg)?;
+        let gguf_tokenized = gguf_tokenizer
+            .encode(passage.as_str(), false)
+            .map_err(anyhow::Error::msg)?;
+        let hf_decoded = hf_tokenizer
+            .decode(hf_tokenized.get_ids(), false)
+            .map_err(anyhow::Error::msg)?;
+        let gguf_decoded = gguf_tokenizer
+            .decode(gguf_tokenized.get_ids(), false)
+            .map_err(anyhow::Error::msg)?;
+        assert_eq!(hf_decoded, gguf_decoded);
+
+        // With special tokens
+        let hf_tokenized = hf_tokenizer
+            .encode(passage.as_str(), true)
+            .map_err(anyhow::Error::msg)?;
+        let gguf_tokenized = gguf_tokenizer
+            .encode(passage.as_str(), true)
+            .map_err(anyhow::Error::msg)?;
+        let hf_decoded = hf_tokenizer
+            .decode(hf_tokenized.get_ids(), true)
+            .map_err(anyhow::Error::msg)?;
+        let gguf_decoded = gguf_tokenizer
+            .decode(gguf_tokenized.get_ids(), true)
+            .map_err(anyhow::Error::msg)?;
+        assert_eq!(hf_decoded, gguf_decoded);
+        Ok(())
+    }
+
+    #[test]
+    fn test_decode() -> Result<()> {
+        use rand::seq::SliceRandom;
+        use rand::thread_rng;
+
+        let hf_tokenizer = get_hf_tokenizer(TokenizerType::Llama)?;
+        let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Llama)?;
+
+        let mut tokens = (0..hf_tokenizer.get_vocab_size(false) as u32).collect::<Vec<_>>();
+        tokens.shuffle(&mut thread_rng());
+
+        // Without skipping special tokens
+        let hf_decoded = hf_tokenizer
+            .decode(&tokens, false)
+            .map_err(anyhow::Error::msg)?;
+        let gguf_decoded = gguf_tokenizer
+            .decode(&tokens, false)
+            .map_err(anyhow::Error::msg)?;
+        assert_eq!(hf_decoded, gguf_decoded);
+
+        // With skipping special tokens
+        let hf_decoded = hf_tokenizer
+            .decode(&tokens, true)
+            .map_err(anyhow::Error::msg)?;
+        let gguf_decoded = gguf_tokenizer
+            .decode(&tokens, true)
+            .map_err(anyhow::Error::msg)?;
+        assert_eq!(hf_decoded, gguf_decoded);
+        Ok(())
+    }
+}
diff --git a/mistralrs-core/src/sampler.rs b/mistralrs-core/src/sampler.rs
index 520b139f0c..a8da56c100 100644
--- a/mistralrs-core/src/sampler.rs
+++ b/mistralrs-core/src/sampler.rs
@@ -413,11 +413,7 @@ mod tests {
 
     #[allow(dead_code)]
     fn get_tokenizer() -> Tokenizer {
-        let api = ApiBuilder::new()
-            .with_progress(true)
-            .with_token(Some(std::env::var("TESTS_HF_TOKEN").unwrap()))
-            .build()
-            .unwrap();
+        let api = ApiBuilder::new().with_progress(true).build().unwrap();
         let api = api.repo(Repo::with_revision(
             "EricB/mistralrs_tests".to_string(),
             RepoType::Model,
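
For context on the decoder chain assembled above for `llama`-style unigram vocabularies (and exercised end to end by the new tests), here is a small standalone sketch of what that sequence does to SentencePiece-style pieces. The example strings are invented; the snippet only reuses the `tokenizers` building blocks the patch already imports:

```rust
use tokenizers::decoders::{
    byte_fallback::ByteFallback, fuse::Fuse, sequence::Sequence, strip::Strip,
};
use tokenizers::normalizers::Replace;
use tokenizers::{Decoder, DecoderWrapper};

fn main() -> tokenizers::Result<()> {
    // The same decoder sequence built in `convert_ggml_to_hf_tokenizer`.
    let decoder = Sequence::new(vec![
        DecoderWrapper::Replace(Replace::new("▁", " ")?),
        DecoderWrapper::ByteFallback(ByteFallback::new()),
        DecoderWrapper::Fuse(Fuse::new()),
        DecoderWrapper::Strip(Strip::new(' ', 1, 0)),
    ]);

    // SentencePiece marks word boundaries with "▁". Replace maps it to a
    // space, ByteFallback would turn raw `<0x..>` byte pieces into bytes,
    // Fuse joins everything into one string, and Strip removes the single
    // leading space introduced by the first piece.
    let pieces = vec!["▁Hello".to_string(), "▁world".to_string(), "!".to_string()];
    assert_eq!(decoder.decode(pieces)?, "Hello world!");
    Ok(())
}
```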

From e0551d3fdca1704fbdf6372565666af3e06b1427 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 14:21:06 -0400
Subject: [PATCH 16/23] Update test

---
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 3bb97fa9bf..5d6e644985 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -230,6 +230,7 @@ mod tests {
         let hf_tokenizer = get_hf_tokenizer(TokenizerType::Llama)?;
         let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Llama)?;
 
+        #[allow(clippy::cast_possible_truncation)]
         let mut tokens = (0..hf_tokenizer.get_vocab_size(false) as u32).collect::<Vec<_>>();
         tokens.shuffle(&mut thread_rng());
 

From 6c832d15dc70202736b88e060108080f79bce972 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 14:27:25 -0400
Subject: [PATCH 17/23] Update docs

---
 README.md                                     | 22 +++++++++++++------
 mistralrs-core/src/pipeline/gguf_tokenizer.rs |  2 +-
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 6bc804e6b0..55edf13427 100644
--- a/README.md
+++ b/README.md
@@ -247,13 +247,6 @@ You can also instruct mistral.rs to load models fully locally by modifying the `
 ./mistralrs_server --port 1234 plain -m . -a mistral
 ```
 
-To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument and instead should pass a path to the
-chat template JSON file (examples [here](chat_templates)) as well as specifying a local model ID. For example:
-
-```bash
-./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
-```
-
 Throughout mistral.rs, any model ID argument or option may be a local path and should contain the following files for each model ID option:
 - `--model-id` (server) or `model_id` (python/rust) or `--tok-model-id` (server) or `tok_model_id` (python/rust): 
   - `config.json`
@@ -269,6 +262,21 @@ Throughout mistral.rs, any model ID argument or option may be a local path and s
 - `--adapters-model-id` (server) or `adapters_model_id` (python/rust):
   - Adapters `.safetensors` and `adapter_config.json` files in their respective directories
 
+## Running GGUF models locally
+
+To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument and instead should pass a path to the
+chat template JSON file (examples [here](chat_templates)) as well as specifying a local model ID. For example:
+
+```bash
+./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
+```
+
+The following tokenizer model types are currently supported. If you would like one to be added, please raise an issue. Otherwise,
+please consider using the method demonstrated in examples below, where the tokenizer is sourced from Hugging Face.
+
+**Supported GGUF tokenizer types**
+- `llama`
+
 ### Run
 
 To start a server serving Mistral GGUF on `localhost:1234`, 
diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 5d6e644985..1a8333616d 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -223,7 +223,7 @@ mod tests {
     }
 
     #[test]
-    fn test_decode() -> Result<()> {
+    fn test_decode_llama() -> Result<()> {
         use rand::seq::SliceRandom;
         use rand::thread_rng;
 

From 30055fff8aacbc402284482b391b995ce04c61e0 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 14:38:52 -0400
Subject: [PATCH 18/23] Update readme

---
 README.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 55edf13427..a75eb5279e 100644
--- a/README.md
+++ b/README.md
@@ -155,7 +155,7 @@ Please submit more benchmarks via raising an issue!
 
 ## Usage
 ### Installation and Build
-To install mistral.rs, one should ensure they have Rust installed by following [this](https://rustup.rs/) link. Additionally, the Hugging Face token should be provided in `~/.cache/huggingface/token` when using the server to enable automatic download of gated models.
+To install mistral.rs, one should ensure they have Rust installed by following [this](https://rustup.rs/) link. Additionally, the Hugging Face token should be provided in `~/.cache/huggingface/token` by running `huggingface-cli login` to enable automatic download of gated models.
 
 1) Install required packages
     - `openssl` (ex., `sudo apt install libssl-dev`)
@@ -169,9 +169,7 @@ To install mistral.rs, one should ensure they have Rust installed by following [
 
 3) Set HF token correctly (skip if already set or your model is not gated, or if you want to use the `token_source` parameters in Python or the command line.)
     ```bash
-    mkdir ~/.cache/huggingface
-    touch ~/.cache/huggingface/token
-    echo <HF_TOKEN_HERE> > ~/.cache/huggingface/token
+    huggingface-cli login
     ```
 
 4) Download the code
@@ -220,6 +218,7 @@ To install mistral.rs, one should ensure they have Rust installed by following [
 
     You can install Python support by following the guide [here](mistralrs-pyo3/README.md).
 
+## Getting models
 ### Getting models from HF Hub
 
 Mistral.rs can automatically download models from HF Hub. To access gated models, you should provide a token source. They may be one of:
@@ -240,7 +239,7 @@ This is passed in the following ways:
 
 If token cannot be loaded, no token will be used (i.e. effectively using `none`).
 
-## Loading models from local files:
+### Loading models from local files:
 
 You can also instruct mistral.rs to load models fully locally by modifying the `*_model_id` arguments or options:
 ```bash
@@ -262,10 +261,10 @@ Throughout mistral.rs, any model ID argument or option may be a local path and s
 - `--adapters-model-id` (server) or `adapters_model_id` (python/rust):
   - Adapters `.safetensors` and `adapter_config.json` files in their respective directories
 
-## Running GGUF models locally
+### Running GGUF models locally
 
 To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument and instead should pass a path to the
-chat template JSON file (examples [here](chat_templates)) as well as specifying a local model ID. For example:
+chat template JSON file (examples [here](chat_templates); you will need to create your own by specifying the chat template and `bos`/`eos` tokens) as well as specifying a local model ID. For example:
 
 ```bash
 ./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf
@@ -277,7 +276,7 @@ please consider using the method demonstrated in examples below, where the token
 **Supported GGUF tokenizer types**
 - `llama`
 
-### Run
+## Run
 
 To start a server serving Mistral GGUF on `localhost:1234`, 
 ```bash

From c374297fa6f087d0ef3b4cee94c3a95504843fa6 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 14:41:48 -0400
Subject: [PATCH 19/23] Update readme

---
 README.md | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index a75eb5279e..80c2b16c5b 100644
--- a/README.md
+++ b/README.md
@@ -219,7 +219,12 @@ To install mistral.rs, one should ensure they have Rust installed by following [
     You can install Python support by following the guide [here](mistralrs-pyo3/README.md).
 
 ## Getting models
-### Getting models from HF Hub
+
+There are 2 ways to run a model with mistral.rs:
+- From Hugging Face Hub (easiest)
+- From local files
+
+### Getting models from Hugging Face Hub
 
 Mistral.rs can automatically download models from HF Hub. To access gated models, you should provide a token source. They may be one of:
 - `literal:<value>`: Load from a specified literal
@@ -299,7 +304,7 @@ Additionally, for models without quantization, the model architecture should be
 You can launch interactive mode, a simple chat application running in the terminal, by passing `-i`:
 
 ```bash
-./mistralrs_server -i gguf -t mistralai/Mistral-7B-Instruct-v0.1 -m TheBloke/Mistral-7B-Instruct-v0.1-GGUF -f mistral-7b-instruct-v0.1.Q4_K_M.gguf
+./mistralrs_server -i plain -m microsoft/Phi-3-mini-128k-instruct -a phi3
 ```
 
 ### Quick examples:
@@ -342,7 +347,7 @@ To start a server running Llama from GGML:
 To start a server running Mistral from safetensors.
 
 ```bash
-./mistralrs_server --port 1234 gguf -m mistralai/Mistral-7B-Instruct-v0.1
+./mistralrs_server --port 1234 plain -m mistralai/Mistral-7B-Instruct-v0.1 -a mistral
 ```
 
 ### Structured selection with a `.toml` file

From 71bdd2f1bff086e0fa850caf4fcc0278b528ba29 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 15:24:25 -0400
Subject: [PATCH 20/23] Bump version

---
 Cargo.toml                             | 2 +-
 mistralrs-bench/Cargo.toml             | 2 +-
 mistralrs-pyo3/Cargo.toml              | 2 +-
 mistralrs-pyo3/Cargo_template.toml     | 2 +-
 mistralrs-pyo3/pyproject.toml          | 2 +-
 mistralrs-pyo3/pyproject_template.toml | 2 +-
 mistralrs-server/Cargo.toml            | 2 +-
 mistralrs/Cargo.toml                   | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 297b55b114..f9583c7f55 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,7 +9,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "0.1.10"
+version = "0.1.11"
 edition = "2021"
 description = "Fast and easy LLM serving."
 homepage = "https://github.com/EricLBuehler/mistral.rs"
diff --git a/mistralrs-bench/Cargo.toml b/mistralrs-bench/Cargo.toml
index 875f7b78b1..8f53baff22 100644
--- a/mistralrs-bench/Cargo.toml
+++ b/mistralrs-bench/Cargo.toml
@@ -17,7 +17,7 @@ candle-core.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 clap.workspace = true
-mistralrs-core = { version = "0.1.10", path = "../mistralrs-core" }
+mistralrs-core = { version = "0.1.11", path = "../mistralrs-core" }
 tracing.workspace = true
 either.workspace = true
 tokio.workspace = true
diff --git a/mistralrs-pyo3/Cargo.toml b/mistralrs-pyo3/Cargo.toml
index cf32d63295..e9d2945442 100644
--- a/mistralrs-pyo3/Cargo.toml
+++ b/mistralrs-pyo3/Cargo.toml
@@ -17,7 +17,7 @@ doc = false
 
 [dependencies]
 pyo3.workspace = true
-mistralrs-core = { version = "0.1.10", path = "../mistralrs-core", features = ["pyo3_macros"] }
+mistralrs-core = { version = "0.1.11", path = "../mistralrs-core", features = ["pyo3_macros"] }
 serde.workspace = true
 serde_json.workspace = true
 candle-core.workspace = true
diff --git a/mistralrs-pyo3/Cargo_template.toml b/mistralrs-pyo3/Cargo_template.toml
index 6944b192fd..a3a52a0eb1 100644
--- a/mistralrs-pyo3/Cargo_template.toml
+++ b/mistralrs-pyo3/Cargo_template.toml
@@ -17,7 +17,7 @@ doc = false
 
 [dependencies]
 pyo3.workspace = true
-mistralrs-core = { version = "0.1.10", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] }
+mistralrs-core = { version = "0.1.11", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] }
 serde.workspace = true
 serde_json.workspace = true
 candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.5.0", features=["$feature_name"] }
diff --git a/mistralrs-pyo3/pyproject.toml b/mistralrs-pyo3/pyproject.toml
index 3aa089e4c9..bf62a1349e 100644
--- a/mistralrs-pyo3/pyproject.toml
+++ b/mistralrs-pyo3/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "mistralrs"
-version = "0.1.10"
+version = "0.1.11"
 requires-python = ">=3.8"
 classifiers = [
     "Programming Language :: Rust",
diff --git a/mistralrs-pyo3/pyproject_template.toml b/mistralrs-pyo3/pyproject_template.toml
index 01e9848235..b8afe9f18a 100644
--- a/mistralrs-pyo3/pyproject_template.toml
+++ b/mistralrs-pyo3/pyproject_template.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "$name"
-version = "0.1.10"
+version = "0.1.11"
 requires-python = ">=3.8"
 classifiers = [
     "Programming Language :: Rust",
diff --git a/mistralrs-server/Cargo.toml b/mistralrs-server/Cargo.toml
index a7fa730c3f..86fe006fd0 100644
--- a/mistralrs-server/Cargo.toml
+++ b/mistralrs-server/Cargo.toml
@@ -22,7 +22,7 @@ axum = { version = "0.7.4", features = ["tokio"] }
 tower-http = { version = "0.5.1", features = ["cors"]}
 utoipa = { version = "4.2", features = ["axum_extras"] }
 utoipa-swagger-ui = { version = "7.1.0", features = ["axum"]}
-mistralrs-core = { version = "0.1.10", path = "../mistralrs-core" }
+mistralrs-core = { version = "0.1.11", path = "../mistralrs-core" }
 dyn-fmt = "0.4.0"
 indexmap.workspace = true
 accelerate-src = { workspace = true, optional = true }
diff --git a/mistralrs/Cargo.toml b/mistralrs/Cargo.toml
index 1ceed5ec12..d105982bd9 100644
--- a/mistralrs/Cargo.toml
+++ b/mistralrs/Cargo.toml
@@ -12,7 +12,7 @@ license.workspace = true
 homepage.workspace = true
 
 [dependencies]
-mistralrs-core = { version = "0.1.10", path = "../mistralrs-core" }
+mistralrs-core = { version = "0.1.11", path = "../mistralrs-core" }
 anyhow.workspace = true
 tokio.workspace = true
 candle-core.workspace = true

From cfe2fd3674c9853e8f3738d7705ed727586efa9d Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 20:10:38 -0400
Subject: [PATCH 21/23] Add examples readme

---
 examples/README.md | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 examples/README.md

diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000000..043a2211d8
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,4 @@
+# Examples
+- Python: [examples here](python)
+- HTTP Server: [examples here](server)
+- Rust: [examples here](../mistralrs/examples/)
\ No newline at end of file

From ddba24b2813cb2614f9f40adacd59e96caabf917 Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Tue, 28 May 2024 21:12:23 -0400
Subject: [PATCH 22/23] Add an example and fixes

---
 chat_templates/llama2.json                    |  3 +
 chat_templates/llama3.json                    |  3 +
 chat_templates/mistral.json                   |  3 +
 chat_templates/phi3.json                      |  3 +
 mistralrs-core/src/pipeline/chat_template.rs  | 29 ++++----
 mistralrs-core/src/pipeline/gguf.rs           | 72 +++++++++++++------
 mistralrs-core/src/pipeline/gguf_tokenizer.rs | 21 +++++-
 mistralrs-core/src/pipeline/macros.rs         | 20 ++++--
 mistralrs-core/src/pipeline/mod.rs            | 12 ++--
 mistralrs/Cargo.toml                          |  4 ++
 mistralrs/examples/gguf_locally/main.rs       | 64 +++++++++++++++++
 mistralrs/examples/quantized/main.rs          |  1 +
 12 files changed, 188 insertions(+), 47 deletions(-)
 create mode 100644 chat_templates/llama2.json
 create mode 100644 chat_templates/llama3.json
 create mode 100644 chat_templates/mistral.json
 create mode 100644 chat_templates/phi3.json
 create mode 100644 mistralrs/examples/gguf_locally/main.rs

diff --git a/chat_templates/llama2.json b/chat_templates/llama2.json
new file mode 100644
index 0000000000..800a077f2c
--- /dev/null
+++ b/chat_templates/llama2.json
@@ -0,0 +1,3 @@
+{
+    "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}"
+}
\ No newline at end of file
diff --git a/chat_templates/llama3.json b/chat_templates/llama3.json
new file mode 100644
index 0000000000..61bafeb2ed
--- /dev/null
+++ b/chat_templates/llama3.json
@@ -0,0 +1,3 @@
+{
+    "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+}
\ No newline at end of file
diff --git a/chat_templates/mistral.json b/chat_templates/mistral.json
new file mode 100644
index 0000000000..15544fda6b
--- /dev/null
+++ b/chat_templates/mistral.json
@@ -0,0 +1,3 @@
+{
+    "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}"
+}
\ No newline at end of file
diff --git a/chat_templates/phi3.json b/chat_templates/phi3.json
new file mode 100644
index 0000000000..6d92f29e6e
--- /dev/null
+++ b/chat_templates/phi3.json
@@ -0,0 +1,3 @@
+{
+    "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+}
\ No newline at end of file
diff --git a/mistralrs-core/src/pipeline/chat_template.rs b/mistralrs-core/src/pipeline/chat_template.rs
index ee7dfa1155..e419b8901b 100644
--- a/mistralrs-core/src/pipeline/chat_template.rs
+++ b/mistralrs-core/src/pipeline/chat_template.rs
@@ -30,9 +30,9 @@ fn raise_exception(msg: String) -> Result<String, minijinja::Error> {
 }
 
 #[derive(Debug, Deserialize)]
-pub struct Unk(#[serde(with = "either::serde_untagged")] pub Either<String, AddedTokensDecoder>);
-#[derive(Debug, Deserialize)]
-pub struct Bos(#[serde(with = "either::serde_untagged")] pub Either<String, AddedTokensDecoder>);
+pub struct BeginEndUnkTok(
+    #[serde(with = "either::serde_untagged")] pub Either<String, AddedTokensDecoder>,
+);
 
 #[allow(dead_code)]
 #[derive(Debug, Deserialize)]
@@ -41,23 +41,22 @@ pub struct ChatTemplate {
     add_eos_token: Option<bool>,
     added_tokens_decoder: Option<HashMap<String, AddedTokensDecoder>>,
     additional_special_tokens: Option<Vec<String>>,
-    pub bos_token: Option<Bos>,
+    pub bos_token: Option<BeginEndUnkTok>,
 
     /// Jinja format chat templating for chat completion.
     /// See: https://huggingface.co/docs/transformers/chat_templating
     pub chat_template: Option<String>,
     clean_up_tokenization_spaces: Option<bool>,
     device_map: Option<String>,
-    #[serde(with = "either::serde_untagged")]
-    pub eos_token: Either<String, AddedTokensDecoder>,
+    pub eos_token: Option<BeginEndUnkTok>,
     legacy: Option<bool>,
-    model_max_length: f64,
+    model_max_length: Option<f64>,
     pad_token: Option<String>,
     sp_model_kwargs: Option<HashMap<String, String>>,
     spaces_between_special_tokens: Option<bool>,
-    tokenizer_class: String,
+    tokenizer_class: Option<String>,
     truncation_size: Option<String>,
-    pub unk_token: Option<Unk>,
+    pub unk_token: Option<BeginEndUnkTok>,
     use_default_system_prompt: Option<bool>,
 }
 
@@ -66,10 +65,10 @@ impl ChatTemplate {
         self.chat_template.is_some()
     }
 
-    pub fn eos_tok(&self) -> String {
-        match self.eos_token {
-            Either::Left(ref lit) => lit.clone(),
-            Either::Right(ref added) => added.content.clone(),
+    pub fn eos_tok(&self) -> Option<String> {
+        match self.eos_token.as_ref()?.0 {
+            Either::Left(ref lit) => Some(lit.clone()),
+            Either::Right(ref added) => Some(added.content.clone()),
         }
     }
 
@@ -93,7 +92,7 @@ pub fn calculate_eos_tokens(
     gen_conf: Option<GenerationConfig>,
     tokenizer: &Tokenizer,
 ) -> Vec<u32> {
-    let mut eos_tok_ids = vec![chat_template.eos_tok()];
+    let mut eos_tok_ids = chat_template.eos_tok().map(|x| vec![x]).unwrap_or_default();
     let mut bos_tok_ids = chat_template.bos_tok().map(|b| vec![b]).unwrap_or_default();
 
     for alternate in SUPPORTED_ALTERNATE_EOS {
@@ -173,7 +172,7 @@ pub fn apply_chat_template_to(
     add_generation_prompt: bool,
     template: &str,
     bos_tok: Option<String>,
-    eos_tok: &str,
+    eos_tok: Option<String>,
     unk_tok: Option<String>,
 ) -> Result<String> {
     let mut env = Environment::new();
diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs
index ae3bb9dcaa..71520b1d6d 100644
--- a/mistralrs-core/src/pipeline/gguf.rs
+++ b/mistralrs-core/src/pipeline/gguf.rs
@@ -6,12 +6,13 @@ use super::{
 use crate::aici::bintokens::build_tok_trie;
 use crate::aici::toktree::TokTrie;
 use crate::lora::Ordering;
-use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig};
-use crate::pipeline::gguf_tokenizer::convert_ggml_to_hf_tokenizer;
+use crate::pipeline::chat_template::{calculate_eos_tokens, BeginEndUnkTok, GenerationConfig};
+use crate::pipeline::gguf_tokenizer::{convert_ggml_to_hf_tokenizer, ConversionResult};
 use crate::pipeline::{get_chat_template, Cache};
 use crate::pipeline::{ChatTemplate, LocalModelPaths};
 use crate::prefix_cacher::PrefixCacheManager;
 use crate::sequence::Sequence;
+use crate::utils::tokenizer::get_tokenizer;
 use crate::utils::varbuilder_utils::{from_mmaped_safetensors, load_preload_adapters};
 use crate::xlora_models::NonGranularState;
 use crate::{do_sample, get_mut_arcmutex, get_paths_gguf, DeviceMapMetadata, DEBUG};
@@ -28,6 +29,7 @@ use candle_core::quantized::{
     GgmlDType,
 };
 use candle_core::{DType, Device, Tensor};
+use either::Either;
 use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
 use rand_isaac::Isaac64Rng;
 use std::fs;
@@ -61,10 +63,10 @@ pub struct GGUFPipeline {
 }
 
 pub struct GGUFLoader {
-    model_id: String,
+    model_id: Option<String>,
     config: GGUFSpecificConfig,
-    quantized_model_id: Option<String>,
-    quantized_filename: Option<String>,
+    quantized_model_id: String,
+    quantized_filename: String,
     xlora_model_id: Option<String>,
     xlora_order: Option<Ordering>,
     no_kv_cache: bool,
@@ -189,7 +191,7 @@ impl GGUFLoaderBuilder {
 
     pub fn build(self) -> Box<dyn Loader> {
         Box::new(GGUFLoader {
-            model_id: self.model_id.unwrap(),
+            model_id: self.model_id,
             config: self.config,
             xlora_model_id: self.xlora_model_id,
             kind: self.kind,
@@ -197,8 +199,8 @@ impl GGUFLoaderBuilder {
             no_kv_cache: self.no_kv_cache,
             chat_template: self.chat_template,
             tgt_non_granular_index: self.tgt_non_granular_index,
-            quantized_filename: Some(self.quantized_filename),
-            quantized_model_id: Some(self.quantized_model_id),
+            quantized_filename: self.quantized_filename,
+            quantized_model_id: self.quantized_model_id,
         })
     }
 }
@@ -208,8 +210,8 @@ impl GGUFLoader {
     pub fn new(
         model_id: Option<String>,
         config: GGUFSpecificConfig,
-        quantized_model_id: Option<String>,
-        quantized_filename: Option<String>,
+        quantized_model_id: String,
+        quantized_filename: String,
         xlora_model_id: Option<String>,
         kind: ModelKind,
         xlora_order: Option<Ordering>,
@@ -218,13 +220,15 @@ impl GGUFLoader {
         tgt_non_granular_index: Option<usize>,
     ) -> Self {
         let model_id = if let Some(id) = model_id {
-            id
-        } else {
+            Some(id)
+        } else if let Some(xlora_order) = xlora_order.clone() {
             info!(
                 "Using adapter base model ID: `{}`",
-                xlora_order.as_ref().unwrap().base_model_id
+                xlora_order.base_model_id
             );
-            xlora_order.as_ref().unwrap().base_model_id.clone()
+            Some(xlora_order.base_model_id.clone())
+        } else {
+            None
         };
         Self {
             model_id,
@@ -280,8 +284,8 @@ impl Loader for GGUFLoader {
             &token_source,
             revision,
             self,
-            self.quantized_model_id,
-            self.quantized_filename,
+            self.quantized_model_id.clone(),
+            self.quantized_filename.clone(),
             silent
         );
         self.load_model_from_path(&paths?, _dtype, device, silent, mapper, in_situ_quant)
@@ -356,7 +360,21 @@ impl Loader for GGUFLoader {
             info!("Debug is enabled, wrote the names and information about each tensor to `mistralrs_gguf_tensors.txt`.");
         }
 
-        let tokenizer = convert_ggml_to_hf_tokenizer(&model)?;
+        let ConversionResult {
+            tokenizer,
+            bos,
+            eos,
+            unk,
+        } = if paths.get_tokenizer_filename().to_string_lossy().is_empty() {
+            convert_ggml_to_hf_tokenizer(&model)?
+        } else {
+            ConversionResult {
+                tokenizer: get_tokenizer(paths.get_tokenizer_filename())?,
+                bos: None,
+                eos: None,
+                unk: None,
+            }
+        };
 
         let mut is_lora = false;
         let model = match self.kind {
@@ -481,7 +499,7 @@ impl Loader for GGUFLoader {
         let gen_conf: Option<GenerationConfig> = paths
             .get_gen_conf_filename()
             .map(|f| serde_json::from_str(&fs::read_to_string(f).unwrap()).unwrap());
-        let chat_template = get_chat_template(paths, &self.chat_template);
+        let mut chat_template = get_chat_template(paths, &self.chat_template);
 
         let max_seq_len = match model {
             Model::Llama(ref l) => l.max_seq_len,
@@ -502,6 +520,17 @@ impl Loader for GGUFLoader {
             Model::Phi3(ref model) => model.cache.lock().len(),
             Model::XLoraPhi3(ref model) => model.cache.lock().len(),
         };
+
+        if chat_template.bos_token.is_none() && bos.is_some() {
+            chat_template.bos_token = Some(BeginEndUnkTok(Either::Left(bos.unwrap())));
+        }
+        if chat_template.eos_token.is_none() && eos.is_some() {
+            chat_template.eos_token = Some(BeginEndUnkTok(Either::Left(eos.unwrap())));
+        }
+        if chat_template.unk_token.is_none() && unk.is_some() {
+            chat_template.unk_token = Some(BeginEndUnkTok(Either::Left(unk.unwrap())));
+        }
+
         let eos = calculate_eos_tokens(&chat_template, gen_conf, &tokenizer);
         Ok(Arc::new(Mutex::new(GGUFPipeline {
             model,
@@ -509,7 +538,10 @@ impl Loader for GGUFLoader {
             tokenizer: tokenizer.into(),
             no_kv_cache: self.no_kv_cache,
             chat_template: Arc::new(chat_template),
-            model_id: self.model_id.clone(),
+            model_id: self
+                .model_id
+                .clone()
+                .unwrap_or(self.quantized_model_id.clone()),
             non_granular_state: self.tgt_non_granular_index.map(|tgt_non_granular_index| {
                 NonGranularState {
                     non_granular_index: Arc::new(Mutex::new(0)),
@@ -532,7 +564,7 @@ impl Loader for GGUFLoader {
     fn get_id(&self) -> String {
         self.xlora_model_id
             .as_deref()
-            .unwrap_or(&self.model_id)
+            .unwrap_or(self.model_id.as_ref().unwrap_or(&self.quantized_model_id))
             .to_string()
     }
 
diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
index 1a8333616d..1d6985c1fc 100644
--- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs
+++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs
@@ -12,7 +12,14 @@ use tracing::info;
 
 use crate::DEBUG;
 
-pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
+pub struct ConversionResult {
+    pub tokenizer: Tokenizer,
+    pub bos: Option<String>,
+    pub eos: Option<String>,
+    pub unk: Option<String>,
+}
+
+pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<ConversionResult> {
     let model = content.metadata["tokenizer.ggml.model"]
         .to_string()
         .expect("GGUF tokenizer model is not a string.")
@@ -67,6 +74,10 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
         .to_u32()
         .expect("GGUF unk token is not u32");
 
+    let bos_str = tokens[bos as usize].clone();
+    let eos_str = tokens[eos as usize].clone();
+    let unk_str = tokens[unk as usize].clone();
+
     let (tokenizer, ty) = match model.as_str() {
         "llama" | "replit" => {
             // unigram
@@ -112,7 +123,12 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> {
     if DEBUG.load(Ordering::Relaxed) {
         info!("Tokenizer: {tokenizer:?}");
     }
-    Ok(tokenizer)
+    Ok(ConversionResult {
+        tokenizer,
+        bos: Some(bos_str),
+        eos: Some(eos_str),
+        unk: Some(unk_str),
+    })
 }
 
 mod tests {
@@ -152,6 +168,7 @@ mod tests {
                         .map_err(anyhow::Error::msg)?,
                 )
                 .map_err(anyhow::Error::msg)
+                .map(|res| res.tokenizer)
             }
             other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"),
         }
diff --git a/mistralrs-core/src/pipeline/macros.rs b/mistralrs-core/src/pipeline/macros.rs
index 7f8f663d59..6e29c940cd 100644
--- a/mistralrs-core/src/pipeline/macros.rs
+++ b/mistralrs-core/src/pipeline/macros.rs
@@ -146,12 +146,14 @@ macro_rules! get_paths_gguf {
             .with_token(get_token($token_source)?)
             .build()?;
         let revision = $revision.unwrap_or("main".to_string());
+        let model_id_this = $this.model_id.clone().unwrap_or($this.quantized_model_id.clone());
+        let model_id_copy = model_id_this.clone();
         let api = api.repo(Repo::with_revision(
-            $this.model_id.clone(),
+            model_id_this.clone(),
             RepoType::Model,
             revision.clone(),
         ));
-        let model_id = std::path::Path::new(&$this.model_id);
+        let model_id = std::path::Path::new(&model_id_copy);
 
         let chat_template = if let Some(ref p) = $this.chat_template {
             if p.ends_with(".json") {
@@ -171,8 +173,8 @@ macro_rules! get_paths_gguf {
         let filenames = get_model_paths(
             revision.clone(),
             &$token_source,
-            &$quantized_model_id,
-            &$quantized_filename,
+            &Some($quantized_model_id),
+            &Some($quantized_filename),
             &api,
             &model_id,
         )?;
@@ -185,7 +187,7 @@ macro_rules! get_paths_gguf {
             xlora_config,
             lora_preload_adapter_info,
         } = get_xlora_paths(
-            $this.model_id.clone(),
+            model_id_this,
             &$this.xlora_model_id,
             &$token_source,
             revision.clone(),
@@ -205,8 +207,14 @@ macro_rules! get_paths_gguf {
             None
         };
 
+        let tokenizer_filename = if $this.model_id.is_some() {
+            $crate::api_get_file!(api, "tokenizer.json", model_id)
+        } else {
+            PathBuf::from_str("")?
+        };
+
         Ok(Box::new($path_name {
-            tokenizer_filename: PathBuf::from_str("")?,
+            tokenizer_filename,
             config_filename: PathBuf::from_str("")?,
             filenames,
             xlora_adapter_configs: adapter_configs,
diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs
index 9d7dbee835..5dae166a37 100644
--- a/mistralrs-core/src/pipeline/mod.rs
+++ b/mistralrs-core/src/pipeline/mod.rs
@@ -598,9 +598,13 @@ pub trait Pipeline: Send + Sync {
         } else {
             None
         };
-        let eos_tok = match chat_template.eos_token {
-            Either::Left(ref lit) => lit,
-            Either::Right(ref added) => &added.content,
+        let eos_tok = if let Some(ref unk) = self.get_chat_template().eos_token {
+            match unk.0 {
+                Either::Left(ref lit) => Some(lit.to_string()),
+                Either::Right(ref added) => Some(added.content.to_string()),
+            }
+        } else {
+            None
         };
         let unk_tok = if let Some(ref unk) = self.get_chat_template().unk_token {
             match unk.0 {
@@ -1436,7 +1440,7 @@ mod tests {
                 true,
                 template,
                 Some(bos.to_string()),
-                eos,
+                Some(eos.to_string()),
                 Some(unk.to_string()),
             )
             .unwrap_or_else(|_| panic!("Template number {i}"));
diff --git a/mistralrs/Cargo.toml b/mistralrs/Cargo.toml
index d105982bd9..49027c7e05 100644
--- a/mistralrs/Cargo.toml
+++ b/mistralrs/Cargo.toml
@@ -52,4 +52,8 @@ required-features = []
 
 [[example]]
 name = "lora_activation"
+required-features = []
+
+[[example]]
+name = "gguf_locally"
 required-features = []
\ No newline at end of file
diff --git a/mistralrs/examples/gguf_locally/main.rs b/mistralrs/examples/gguf_locally/main.rs
new file mode 100644
index 0000000000..b04fc9fa53
--- /dev/null
+++ b/mistralrs/examples/gguf_locally/main.rs
@@ -0,0 +1,64 @@
+use std::sync::Arc;
+use tokio::sync::mpsc::channel;
+
+use mistralrs::{
+    Constraint, Device, DeviceMapMetadata, GGUFLoaderBuilder, GGUFSpecificConfig, MistralRs,
+    MistralRsBuilder, NormalRequest, Request, RequestMessage, Response, SamplingParams,
+    SchedulerMethod, TokenSource,
+};
+
+fn setup() -> anyhow::Result<Arc<MistralRs>> {
+    // Select a Mistral model
+    // We do not use any files from HF servers here, and instead load the
+    // chat template from the specified file, and the tokenizer and model from a
+    // local GGUF file at the path `.`
+    let loader = GGUFLoaderBuilder::new(
+        GGUFSpecificConfig { repeat_last_n: 64 },
+        Some("chat_templates/mistral.json".to_string()),
+        None,
+        ".".to_string(),
+        "mistral-7b-instruct-v0.1.Q4_K_M.gguf".to_string(),
+    )
+    .build();
+    // Load, into a Pipeline
+    let pipeline = loader.load_model_from_hf(
+        None,
+        TokenSource::CacheToken,
+        None,
+        &Device::cuda_if_available(0)?,
+        false,
+        DeviceMapMetadata::dummy(),
+        None,
+    )?;
+    // Create the MistralRs, which is a runner
+    Ok(MistralRsBuilder::new(pipeline, SchedulerMethod::Fixed(5.try_into().unwrap())).build())
+}
+
+fn main() -> anyhow::Result<()> {
+    let mistralrs = setup()?;
+
+    let (tx, mut rx) = channel(10_000);
+    let request = Request::Normal(NormalRequest {
+        messages: RequestMessage::Completion {
+            text: "Hello! My name is ".to_string(),
+            echo_prompt: false,
+            best_of: 1,
+        },
+        sampling_params: SamplingParams::default(),
+        response: tx,
+        return_logprobs: false,
+        is_streaming: false,
+        id: 0,
+        constraint: Constraint::None,
+        suffix: None,
+        adapters: None,
+    });
+    mistralrs.get_sender().blocking_send(request)?;
+
+    let response = rx.blocking_recv().unwrap();
+    match response {
+        Response::CompletionDone(c) => println!("Text: {}", c.choices[0].text),
+        _ => unreachable!(),
+    }
+    Ok(())
+}
diff --git a/mistralrs/examples/quantized/main.rs b/mistralrs/examples/quantized/main.rs
index 58f1ac92b4..b6539edaf2 100644
--- a/mistralrs/examples/quantized/main.rs
+++ b/mistralrs/examples/quantized/main.rs
@@ -9,6 +9,7 @@ use mistralrs::{
 
 fn setup() -> anyhow::Result<Arc<MistralRs>> {
     // Select a Mistral model
+    // This uses a model, tokenizer, and chat template from the HF hub.
     let loader = GGUFLoaderBuilder::new(
         GGUFSpecificConfig { repeat_last_n: 64 },
         None,

From 813d83211e1391060261adde8de15b48cbe814cf Mon Sep 17 00:00:00 2001
From: EricLBuehler <ericlbuehler@gmail.com>
Date: Wed, 29 May 2024 10:04:21 -0400
Subject: [PATCH 23/23] Allow unauth for local

---
 mistralrs-core/src/pipeline/macros.rs | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/mistralrs-core/src/pipeline/macros.rs b/mistralrs-core/src/pipeline/macros.rs
index 6e29c940cd..b9bf402f2f 100644
--- a/mistralrs-core/src/pipeline/macros.rs
+++ b/mistralrs-core/src/pipeline/macros.rs
@@ -11,14 +11,21 @@ macro_rules! api_dir_list {
             .unwrap_or_else(|e| {
                 // If we do not get a 404, it was something else.
                 let format = format!("{e:?}");
+                let mut unauth = false;
                 if let hf_hub::api::sync::ApiError::RequestError(resp) = e {
-                    if resp.into_response().is_some_and(|r| r.status() != 404) {
+                    let resp = resp.into_response();
+                    // If it's 401, assume that we're running locally only.
+                    if resp.as_ref().is_some_and(|r| r.status() != 401) {
+                        unauth = true;
+                    } else if resp.as_ref().is_some_and(|r| r.status() != 404) {
                         panic!("{format}");
                     }
                 }
 
                 let listing = std::fs::read_dir($model_id);
-                if listing.is_err() {
+                if listing.is_err() && unauth {
+                    panic!("{format}");
+                } else if listing.is_err() {
                     panic!("Cannot list directory {:?}", $model_id)
                 }
                 let listing = listing.unwrap();
@@ -43,14 +50,21 @@ macro_rules! api_get_file {
         $api.get($file).unwrap_or_else(|e| {
             // If we do not get a 404, it was something else.
             let format = format!("{e:?}");
+            let mut unauth = false;
             if let hf_hub::api::sync::ApiError::RequestError(resp) = e {
-                if resp.into_response().is_some_and(|r| r.status() != 404) {
+                let resp = resp.into_response();
+                // If it's 401, assume that we're running locally only.
+                if resp.as_ref().is_some_and(|r| r.status() != 401) {
+                    unauth = true;
+                } else if resp.as_ref().is_some_and(|r| r.status() != 404) {
                     panic!("{format}");
                 }
             }
 
             let path = $model_id.join($file);
-            if !path.exists() {
+            if !path.exists() && unauth {
+                panic!("{format}");
+            } else if !path.exists() {
                 panic!("File \"{}\" not found at model id {:?}", $file, $model_id)
             }
             info!("Loading `{:?}` locally at `{path:?}`", &$file);