From 1c2f5d7a1e67d0f5e1f0287bedadb8b2dbc175b7 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Sat, 25 May 2024 10:56:23 -0400 Subject: [PATCH 01/23] Add automatic conversion from gguf to hf tokenizer --- mistralrs-core/src/pipeline/gguf.rs | 6 +- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 92 +++++++++++++++++++ mistralrs-core/src/pipeline/mod.rs | 1 + 3 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 mistralrs-core/src/pipeline/gguf_tokenizer.rs diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs index eb3a7ac09c..ae40ec16ae 100644 --- a/mistralrs-core/src/pipeline/gguf.rs +++ b/mistralrs-core/src/pipeline/gguf.rs @@ -7,11 +7,11 @@ use crate::aici::bintokens::build_tok_trie; use crate::aici::toktree::TokTrie; use crate::lora::Ordering; use crate::pipeline::chat_template::calculate_eos_tokens; +use crate::pipeline::gguf_tokenizer::convert_ggml_to_hf_tokenizer; use crate::pipeline::Cache; use crate::pipeline::{ChatTemplate, LocalModelPaths}; use crate::prefix_cacher::PrefixCacheManager; use crate::sequence::Sequence; -use crate::utils::tokenizer::get_tokenizer; use crate::utils::varbuilder_utils::{from_mmaped_safetensors, load_preload_adapters}; use crate::xlora_models::NonGranularState; use crate::{deserialize_chat_template, do_sample, get_mut_arcmutex, get_paths, DeviceMapMetadata}; @@ -329,6 +329,8 @@ impl Loader for GGUFLoader { } } + let tokenizer = convert_ggml_to_hf_tokenizer(&model)?; + let mut is_lora = false; let model = match self.kind { ModelKind::QuantizedGGUF => match arch { @@ -449,8 +451,6 @@ impl Loader for GGUFLoader { _ => unreachable!(), }; - let tokenizer = get_tokenizer(paths.get_tokenizer_filename())?; - let (chat_template, gen_conf) = deserialize_chat_template!(paths, self); let max_seq_len = match model { diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs new file mode 100644 index 0000000000..246c7bbd94 --- /dev/null +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -0,0 +1,92 @@ +use std::collections::HashMap; + +use anyhow::Result; +use candle_core::quantized::gguf_file::Content; +use tokenizers::{models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer}; + +pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { + let model = content.metadata["tokenizer.ggml.model"] + .to_string() + .expect("GGUF tokenizer model is not a string.") + .clone(); + let tokens = content.metadata["tokenizer.ggml.tokens"] + .to_vec() + .expect("GGUF tokenizer tokens is not a vec.") + .iter() + .map(|t| t.to_string().expect("GGUF token is not a string.").clone()) + .collect::<Vec<_>>(); + let added_tokens = content + .metadata + .get("tokenizer.ggml.added_tokens") + .map(|items| { + items + .to_vec() + .expect("GGUF tokenizer added_tokens is not a vec.") + .iter() + .map(|t| { + t.to_string() + .expect("GGUF added_token is not a string.") + .clone() + }) + .collect::<Vec<_>>() + }); + let merges = content.metadata.get("tokenizer.ggml.merges").map(|items| { + items + .to_vec() + .expect("GGUF tokenizer merges is not a vec.") + .iter() + .map(|t| t.to_string().expect("GGUF merges is not a string.").clone()) + .collect::<Vec<_>>() + }); + + let _bos = content.metadata["tokenizer.ggml.bos_token_id"] + .to_u32() + .expect("GGUF bos token is not u32"); + let _eos = content.metadata["tokenizer.ggml.eos_token_id"] + .to_u32() + .expect("GGUF eos token is not u32"); + let unk = 
content.metadata["tokenizer.ggml.unknown_token_id"] + .to_u32() + .expect("GGUF unk token is not u32"); + let _sep = content.metadata["tokenizer.ggml.separator_token_id"] + .to_u32() + .expect("GGUF sep token is not u32"); + let _pad = content.metadata["tokenizer.ggml.padding_token_id"] + .to_u32() + .expect("GGUF pad token is not u32"); + + let tokenizer = match model.as_str() { + "llama" | "replit" | "gpt2" | "rwkv" => { + // BPE, as seen in relevant tokenizer.json files + let bpe_builder = BpeBuilder::new().unk_token(tokens[unk as usize].clone()); + + let mut vocab = HashMap::new(); + for (i, tok) in tokens.into_iter().enumerate() { + #[allow(clippy::cast_possible_truncation)] + vocab.insert(tok, i as u32); + } + let mut merges_vec = Vec::new(); + if let Some(merges) = merges { + for tok in merges { + let split = tok.splitn(2, ' ').collect::<Vec<_>>(); + merges_vec.push((split[0].to_string(), split[1].to_string())); + } + } + let bpe = bpe_builder + .vocab_and_merges(vocab, merges_vec) + .build() + .map_err(anyhow::Error::msg)?; + let mut tokenizer = Tokenizer::new(ModelWrapper::BPE(bpe)); + if let Some(added_tokens) = added_tokens { + for added_token in added_tokens { + tokenizer.add_special_tokens(&[AddedToken::from(added_token, true)]); + } + } + tokenizer + } + other => { + anyhow::bail!("Tokenizer model `{other}` not supported."); + } + }; + Ok(tokenizer) +} diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index c2c5512ff3..6b61dd0ea2 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -2,6 +2,7 @@ mod cache_manager; mod chat_template; mod ggml; mod gguf; +mod gguf_tokenizer; mod loaders; mod macros; mod normal; From b3ac5c80e3d98e5572a9e28544984733365ab4fa Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Sat, 25 May 2024 11:00:24 -0400 Subject: [PATCH 02/23] Add info messages --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 246c7bbd94..27c2cdf6a6 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use anyhow::Result; use candle_core::quantized::gguf_file::Content; use tokenizers::{models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer}; +use tracing::info; pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { let model = content.metadata["tokenizer.ggml.model"] @@ -39,6 +40,12 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { .collect::<Vec<_>>() }); + info!( + "Converting GGML tokenizer. 
Model: `{model}`, num tokens: {}, num added tokens: {}, num merges: {}", + tokens.len(), + added_tokens.as_ref().map(|x| x.len()).unwrap_or(0), + merges.as_ref().map(|x| x.len()).unwrap_or(0) + ); let _bos = content.metadata["tokenizer.ggml.bos_token_id"] .to_u32() .expect("GGUF bos token is not u32"); @@ -59,6 +66,7 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { "llama" | "replit" | "gpt2" | "rwkv" => { // BPE, as seen in relevant tokenizer.json files let bpe_builder = BpeBuilder::new().unk_token(tokens[unk as usize].clone()); + info!("Loading as BPE tokenizer."); let mut vocab = HashMap::new(); for (i, tok) in tokens.into_iter().enumerate() { From 36c46cc602933b96c8627ae2bab9d9d862a9ba0d Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Sat, 25 May 2024 11:12:01 -0400 Subject: [PATCH 03/23] Add decoder to tokenizer --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 27c2cdf6a6..66fdf4349f 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -2,7 +2,9 @@ use std::collections::HashMap; use anyhow::Result; use candle_core::quantized::gguf_file::Content; -use tokenizers::{models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer}; +use tokenizers::{ + decoders::bpe::BPEDecoder, models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer, +}; use tracing::info; pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { @@ -46,21 +48,9 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { added_tokens.as_ref().map(|x| x.len()).unwrap_or(0), merges.as_ref().map(|x| x.len()).unwrap_or(0) ); - let _bos = content.metadata["tokenizer.ggml.bos_token_id"] - .to_u32() - .expect("GGUF bos token is not u32"); - let _eos = content.metadata["tokenizer.ggml.eos_token_id"] - .to_u32() - .expect("GGUF eos token is not u32"); let unk = content.metadata["tokenizer.ggml.unknown_token_id"] .to_u32() .expect("GGUF unk token is not u32"); - let _sep = content.metadata["tokenizer.ggml.separator_token_id"] - .to_u32() - .expect("GGUF sep token is not u32"); - let _pad = content.metadata["tokenizer.ggml.padding_token_id"] - .to_u32() - .expect("GGUF pad token is not u32"); let tokenizer = match model.as_str() { "llama" | "replit" | "gpt2" | "rwkv" => { @@ -85,6 +75,7 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { .build() .map_err(anyhow::Error::msg)?; let mut tokenizer = Tokenizer::new(ModelWrapper::BPE(bpe)); + tokenizer.with_decoder(BPEDecoder::default()); if let Some(added_tokens) = added_tokens { for added_token in added_tokens { tokenizer.add_special_tokens(&[AddedToken::from(added_token, true)]); From be2fca1be34856c0e64bbb1a5f16922707d206a2 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Sat, 25 May 2024 16:47:37 -0400 Subject: [PATCH 04/23] More progress, its horrifying --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 58 +++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 66fdf4349f..01e7004639 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -3,7 +3,11 @@ use std::collections::HashMap; use anyhow::Result; 
use candle_core::quantized::gguf_file::Content; use tokenizers::{ - decoders::bpe::BPEDecoder, models::bpe::BpeBuilder, AddedToken, ModelWrapper, Tokenizer, + decoders::{byte_fallback::ByteFallback, fuse::Fuse, sequence::Sequence, strip::Strip}, + models::bpe::BpeBuilder, + normalizers::{self, Prepend, Replace}, + processors::template::{self, Template, TemplateProcessing, Tokens}, + AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer, }; use tracing::info; @@ -52,6 +56,14 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { .to_u32() .expect("GGUF unk token is not u32"); + let eos = content.metadata["tokenizer.ggml.eos_token_id"] + .to_u32() + .expect("GGUF unk token is not u32"); + + let bos = content.metadata["tokenizer.ggml.bos_token_id"] + .to_u32() + .expect("GGUF unk token is not u32"); + let tokenizer = match model.as_str() { "llama" | "replit" | "gpt2" | "rwkv" => { // BPE, as seen in relevant tokenizer.json files @@ -59,9 +71,9 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { info!("Loading as BPE tokenizer."); let mut vocab = HashMap::new(); - for (i, tok) in tokens.into_iter().enumerate() { + for (i, tok) in tokens.iter().enumerate() { #[allow(clippy::cast_possible_truncation)] - vocab.insert(tok, i as u32); + vocab.insert(tok.clone(), i as u32); } let mut merges_vec = Vec::new(); if let Some(merges) = merges { @@ -72,15 +84,53 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { } let bpe = bpe_builder .vocab_and_merges(vocab, merges_vec) + .fuse_unk(true) .build() .map_err(anyhow::Error::msg)?; let mut tokenizer = Tokenizer::new(ModelWrapper::BPE(bpe)); - tokenizer.with_decoder(BPEDecoder::default()); + tokenizer.with_decoder(Sequence::new(vec![ + DecoderWrapper::Replace(Replace::new("▁", " ").map_err(anyhow::Error::msg)?), + DecoderWrapper::ByteFallback(ByteFallback::default()), + DecoderWrapper::Fuse(Fuse::new()), + DecoderWrapper::Strip(Strip::new(' ', 1, 0)), + ])); if let Some(added_tokens) = added_tokens { for added_token in added_tokens { tokenizer.add_special_tokens(&[AddedToken::from(added_token, true)]); } } + tokenizer.add_special_tokens(&[AddedToken::from(tokens[bos as usize].clone(), true)]); + tokenizer.add_special_tokens(&[AddedToken::from(tokens[eos as usize].clone(), true)]); + tokenizer.add_special_tokens(&[AddedToken::from(tokens[unk as usize].clone(), true)]); + + tokenizer.with_post_processor( + TemplateProcessing::builder() + .special_tokens(Tokens::from(vec![template::SpecialToken::new( + tokens[bos as usize].clone(), + vec![bos], + vec![tokens[bos as usize].clone()], + ) + .map_err(anyhow::Error::msg)?])) + .pair( + Template::try_from(vec![ + tokens[bos as usize].clone(), + "$A".to_string(), + tokens[bos as usize].clone(), + "$B:1".to_string(), + ]) + .unwrap(), + ) + .single( + Template::try_from(vec![tokens[bos as usize].clone(), "$A".to_string()]) + .unwrap(), + ) + .build()?, + ); + tokenizer.with_normalizer(normalizers::Sequence::new(vec![ + NormalizerWrapper::Prepend(Prepend::new("▁".to_string())), + NormalizerWrapper::Replace(Replace::new(" ", "▁").map_err(anyhow::Error::msg)?), + ])); + info!("Decoder is: {:?}", tokenizer.get_decoder()); tokenizer } other => { From ba44cca98cba1ba53e9370ca1941fee5bda9f617 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 05:19:01 -0400 Subject: [PATCH 05/23] Merge --- mistralrs-core/src/pipeline/gguf.rs | 2 -- 1 file changed, 2 deletions(-) diff --git 
a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs index 5b1f1ca65d..6e104836c1 100644 --- a/mistralrs-core/src/pipeline/gguf.rs +++ b/mistralrs-core/src/pipeline/gguf.rs @@ -482,8 +482,6 @@ impl Loader for GGUFLoader { _ => unreachable!(), }; - let tokenizer = get_tokenizer(paths.get_tokenizer_filename())?; - let gen_conf: Option<GenerationConfig> = paths .get_gen_conf_filename() .map(|f| serde_json::from_str(&fs::read_to_string(f).unwrap()).unwrap()); From b276c160189f8e900efd7213acb031b3061f82de Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 07:17:56 -0400 Subject: [PATCH 06/23] Use unigram tokenizer for llama --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 102 ++++-------------- 1 file changed, 23 insertions(+), 79 deletions(-) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 01e7004639..697f0f320b 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -1,14 +1,6 @@ -use std::collections::HashMap; - use anyhow::Result; use candle_core::quantized::gguf_file::Content; -use tokenizers::{ - decoders::{byte_fallback::ByteFallback, fuse::Fuse, sequence::Sequence, strip::Strip}, - models::bpe::BpeBuilder, - normalizers::{self, Prepend, Replace}, - processors::template::{self, Template, TemplateProcessing, Tokens}, - AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer, -}; +use tokenizers::{models::unigram::Unigram, ModelWrapper, Tokenizer}; use tracing::info; pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { @@ -37,6 +29,14 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { }) .collect::<Vec<_>>() }); + let scores = content.metadata.get("tokenizer.ggml.scores").map(|items| { + items + .to_vec() + .expect("GGUF tokenizer scores is not a vec.") + .iter() + .map(|t| t.to_f32().expect("GGUF score is not a f32.")) + .collect::<Vec<_>>() + }); let merges = content.metadata.get("tokenizer.ggml.merges").map(|items| { items .to_vec() @@ -47,91 +47,35 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { }); info!( - "Converting GGML tokenizer. Model: `{model}`, num tokens: {}, num added tokens: {}, num merges: {}", + "Converting GGML tokenizer. 
Model: `{model}`, num tokens: {}, num added tokens: {}, num merges: {}, num scores: {}", tokens.len(), added_tokens.as_ref().map(|x| x.len()).unwrap_or(0), - merges.as_ref().map(|x| x.len()).unwrap_or(0) + merges.as_ref().map(|x| x.len()).unwrap_or(0), + scores.as_ref().map(|x| x.len()).unwrap_or(0) ); let unk = content.metadata["tokenizer.ggml.unknown_token_id"] .to_u32() .expect("GGUF unk token is not u32"); - let eos = content.metadata["tokenizer.ggml.eos_token_id"] + let _eos = content.metadata["tokenizer.ggml.eos_token_id"] .to_u32() .expect("GGUF unk token is not u32"); - let bos = content.metadata["tokenizer.ggml.bos_token_id"] + let _bos = content.metadata["tokenizer.ggml.bos_token_id"] .to_u32() .expect("GGUF unk token is not u32"); let tokenizer = match model.as_str() { - "llama" | "replit" | "gpt2" | "rwkv" => { - // BPE, as seen in relevant tokenizer.json files - let bpe_builder = BpeBuilder::new().unk_token(tokens[unk as usize].clone()); - info!("Loading as BPE tokenizer."); - - let mut vocab = HashMap::new(); - for (i, tok) in tokens.iter().enumerate() { - #[allow(clippy::cast_possible_truncation)] - vocab.insert(tok.clone(), i as u32); - } - let mut merges_vec = Vec::new(); - if let Some(merges) = merges { - for tok in merges { - let split = tok.splitn(2, ' ').collect::<Vec<_>>(); - merges_vec.push((split[0].to_string(), split[1].to_string())); - } + "llama" => { + let scores = + scores.expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokeizer."); + let mut vocab = Vec::new(); + for (token, score) in tokens.into_iter().zip(scores) { + vocab.push((token, score as f64)); } - let bpe = bpe_builder - .vocab_and_merges(vocab, merges_vec) - .fuse_unk(true) - .build() - .map_err(anyhow::Error::msg)?; - let mut tokenizer = Tokenizer::new(ModelWrapper::BPE(bpe)); - tokenizer.with_decoder(Sequence::new(vec![ - DecoderWrapper::Replace(Replace::new("▁", " ").map_err(anyhow::Error::msg)?), - DecoderWrapper::ByteFallback(ByteFallback::default()), - DecoderWrapper::Fuse(Fuse::new()), - DecoderWrapper::Strip(Strip::new(' ', 1, 0)), - ])); - if let Some(added_tokens) = added_tokens { - for added_token in added_tokens { - tokenizer.add_special_tokens(&[AddedToken::from(added_token, true)]); - } - } - tokenizer.add_special_tokens(&[AddedToken::from(tokens[bos as usize].clone(), true)]); - tokenizer.add_special_tokens(&[AddedToken::from(tokens[eos as usize].clone(), true)]); - tokenizer.add_special_tokens(&[AddedToken::from(tokens[unk as usize].clone(), true)]); - - tokenizer.with_post_processor( - TemplateProcessing::builder() - .special_tokens(Tokens::from(vec![template::SpecialToken::new( - tokens[bos as usize].clone(), - vec![bos], - vec![tokens[bos as usize].clone()], - ) - .map_err(anyhow::Error::msg)?])) - .pair( - Template::try_from(vec![ - tokens[bos as usize].clone(), - "$A".to_string(), - tokens[bos as usize].clone(), - "$B:1".to_string(), - ]) - .unwrap(), - ) - .single( - Template::try_from(vec![tokens[bos as usize].clone(), "$A".to_string()]) - .unwrap(), - ) - .build()?, - ); - tokenizer.with_normalizer(normalizers::Sequence::new(vec![ - NormalizerWrapper::Prepend(Prepend::new("▁".to_string())), - NormalizerWrapper::Replace(Replace::new(" ", "▁").map_err(anyhow::Error::msg)?), - ])); - info!("Decoder is: {:?}", tokenizer.get_decoder()); - tokenizer + let unigram = + Unigram::from(vocab, Some(unk as usize), true).map_err(anyhow::Error::msg)?; + Tokenizer::new(ModelWrapper::Unigram(unigram)) } other => { anyhow::bail!("Tokenizer model `{other}` not supported."); From 
1e31df7835279c4a263b932c39aa2d3184fb75cb Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 07:23:01 -0400 Subject: [PATCH 07/23] Logging --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 697f0f320b..783b3fb167 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -81,5 +81,6 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { anyhow::bail!("Tokenizer model `{other}` not supported."); } }; + info!("GGUF tokenizer model is `{model}`: {tokenizer:?}."); Ok(tokenizer) } From dd5a855b24dff1783fceba7b3707a603b2421b5f Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 07:39:56 -0400 Subject: [PATCH 08/23] Implement for llama and replit --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 783b3fb167..8e15911f9d 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -1,6 +1,11 @@ use anyhow::Result; use candle_core::quantized::gguf_file::Content; -use tokenizers::{models::unigram::Unigram, ModelWrapper, Tokenizer}; +use tokenizers::{ + decoders::{byte_fallback::ByteFallback, sequence::Sequence, strip::Strip}, + models::unigram::Unigram, + normalizers::Replace, + DecoderWrapper, ModelWrapper, Tokenizer, +}; use tracing::info; pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { @@ -66,7 +71,8 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { .expect("GGUF unk token is not u32"); let tokenizer = match model.as_str() { - "llama" => { + "llama" | "replit" => { + // unigram let scores = scores.expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokeizer."); let mut vocab = Vec::new(); @@ -75,7 +81,13 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { } let unigram = Unigram::from(vocab, Some(unk as usize), true).map_err(anyhow::Error::msg)?; - Tokenizer::new(ModelWrapper::Unigram(unigram)) + let mut tokenizer = Tokenizer::new(ModelWrapper::Unigram(unigram)); + tokenizer.with_decoder(Sequence::new(vec![ + DecoderWrapper::Replace(Replace::new("▁", " ").map_err(anyhow::Error::msg)?), + DecoderWrapper::ByteFallback(ByteFallback::new()), + DecoderWrapper::Strip(Strip::new(' ', 1, 0)), + ])); + tokenizer } other => { anyhow::bail!("Tokenizer model `{other}` not supported."); From d68522cf25376f1e5c7de60bf17d54b549be14dd Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 07:43:15 -0400 Subject: [PATCH 09/23] Better logging --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 8e15911f9d..42fb9a40b5 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -70,7 +70,7 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { .to_u32() .expect("GGUF unk token is not u32"); - let tokenizer = match model.as_str() { + let (tokenizer, ty) = match model.as_str() { "llama" | "replit" => 
{ // unigram let scores = @@ -87,12 +87,16 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { DecoderWrapper::ByteFallback(ByteFallback::new()), DecoderWrapper::Strip(Strip::new(' ', 1, 0)), ])); - tokenizer + (tokenizer, "unigram") } other => { anyhow::bail!("Tokenizer model `{other}` not supported."); } }; - info!("GGUF tokenizer model is `{model}`: {tokenizer:?}."); + info!( + "GGUF tokenizer model is `{model}`, num vocab: {}, kind: `{}`", + tokenizer.get_vocab_size(true), + ty + ); Ok(tokenizer) } From d366d2aba0fa5ae1d8c96268480ec71db7d657de Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 07:46:47 -0400 Subject: [PATCH 10/23] Nicer logging --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 42fb9a40b5..47eaef3ec8 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -51,13 +51,6 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { .collect::<Vec<_>>() }); - info!( - "Converting GGML tokenizer. Model: `{model}`, num tokens: {}, num added tokens: {}, num merges: {}, num scores: {}", - tokens.len(), - added_tokens.as_ref().map(|x| x.len()).unwrap_or(0), - merges.as_ref().map(|x| x.len()).unwrap_or(0), - scores.as_ref().map(|x| x.len()).unwrap_or(0) - ); let unk = content.metadata["tokenizer.ggml.unknown_token_id"] .to_u32() .expect("GGUF unk token is not u32"); @@ -73,11 +66,12 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { let (tokenizer, ty) = match model.as_str() { "llama" | "replit" => { // unigram - let scores = - scores.expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokeizer."); + let scores = scores + .as_ref() + .expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokeizer."); let mut vocab = Vec::new(); for (token, score) in tokens.into_iter().zip(scores) { - vocab.push((token, score as f64)); + vocab.push((token, *score as f64)); } let unigram = Unigram::from(vocab, Some(unk as usize), true).map_err(anyhow::Error::msg)?; @@ -94,9 +88,12 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { } }; info!( - "GGUF tokenizer model is `{model}`, num vocab: {}, kind: `{}`", + "GGUF tokenizer model is `{model}`, kind: `{}`, num tokens: {}, num added tokens: {}, num merges: {}, num scores: {}", + ty, tokenizer.get_vocab_size(true), - ty + added_tokens.as_ref().map(|x| x.len()).unwrap_or(0), + merges.as_ref().map(|x| x.len()).unwrap_or(0), + scores.as_ref().map(|x| x.len()).unwrap_or(0) ); Ok(tokenizer) } From 3d416a7ffb50434d4ac2bb2b85c07315c4d56f6a Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 07:47:30 -0400 Subject: [PATCH 11/23] Update for verbose mode --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 47eaef3ec8..b693bb1544 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -1,3 +1,5 @@ +use std::sync::atomic::Ordering; + use anyhow::Result; use candle_core::quantized::gguf_file::Content; use tokenizers::{ @@ -8,6 +10,8 @@ use tokenizers::{ }; use tracing::info; +use crate::DEBUG; + pub fn 
convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { let model = content.metadata["tokenizer.ggml.model"] .to_string() @@ -95,5 +99,8 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { merges.as_ref().map(|x| x.len()).unwrap_or(0), scores.as_ref().map(|x| x.len()).unwrap_or(0) ); + if DEBUG.load(Ordering::Relaxed) { + info!("Tokenizer: {tokenizer:?}"); + } Ok(tokenizer) } From 19cf0288392aa8e2587997f6a62afc3aaf045ae3 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 08:34:02 -0400 Subject: [PATCH 12/23] Allow fully local loading for gguf --- mistralrs-core/src/model_loader.rs | 6 -- mistralrs-core/src/model_selected.rs | 24 +++----- mistralrs-core/src/pipeline/gguf.rs | 18 +++--- mistralrs-core/src/pipeline/macros.rs | 83 +++++++++++++++++++++++++++ mistralrs-core/src/pipeline/mod.rs | 16 +++++- mistralrs-core/src/toml_selector.rs | 15 +++-- mistralrs-pyo3/API.md | 12 +++- mistralrs-pyo3/mistralrs.pyi | 3 - mistralrs-pyo3/src/lib.rs | 6 -- mistralrs-pyo3/src/which.rs | 3 - mistralrs/examples/quantized/main.rs | 1 - 11 files changed, 131 insertions(+), 56 deletions(-) diff --git a/mistralrs-core/src/model_loader.rs b/mistralrs-core/src/model_loader.rs index b7438b0f08..3ab381ad97 100644 --- a/mistralrs-core/src/model_loader.rs +++ b/mistralrs-core/src/model_loader.rs @@ -150,14 +150,12 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa .build(arch), ModelSelected::GGUF { tok_model_id, - tokenizer_json, quantized_model_id, quantized_filename, repeat_last_n, } => GGUFLoaderBuilder::new( GGUFSpecificConfig { repeat_last_n }, args.chat_template, - tokenizer_json, Some(tok_model_id), quantized_model_id, quantized_filename, @@ -165,7 +163,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa .build(), ModelSelected::XLoraGGUF { tok_model_id, - tokenizer_json, quantized_model_id, quantized_filename, repeat_last_n, @@ -175,7 +172,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa } => GGUFLoaderBuilder::new( GGUFSpecificConfig { repeat_last_n }, args.chat_template, - tokenizer_json, tok_model_id, quantized_model_id, quantized_filename, @@ -192,7 +188,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa .build(), ModelSelected::LoraGGUF { tok_model_id, - tokenizer_json, quantized_model_id, quantized_filename, repeat_last_n, @@ -201,7 +196,6 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa } => GGUFLoaderBuilder::new( GGUFSpecificConfig { repeat_last_n }, args.chat_template, - tokenizer_json, tok_model_id, quantized_model_id, quantized_filename, diff --git a/mistralrs-core/src/model_selected.rs b/mistralrs-core/src/model_selected.rs index 6642c3f8fc..1bf68939d6 100644 --- a/mistralrs-core/src/model_selected.rs +++ b/mistralrs-core/src/model_selected.rs @@ -95,14 +95,12 @@ pub enum ModelSelected { /// Select a GGUF model. GGUF { - /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path. + /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file. + /// If the `chat_template` is specified, then it will be treated as a path and used over remote files, + /// removing all remote accesses. #[arg(short, long)] tok_model_id: String, - /// Path to local tokenizer.json file. If this is specified it is used over any remote file. 
- #[arg(long)] - tokenizer_json: Option<String>, - /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set. /// This may be a HF hub repo or a local path. #[arg(short = 'm', long)] @@ -119,14 +117,12 @@ pub enum ModelSelected { /// Select a GGUF model with X-LoRA. XLoraGGUF { - /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path. + /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file. + /// If the `chat_template` is specified, then it will be treated as a path and used over remote files, + /// removing all remote accesses. #[arg(short, long)] tok_model_id: Option<String>, - /// Path to local tokenizer.json file. If this is specified it is used over any remote file. - #[arg(long)] - tokenizer_json: Option<String>, - /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set. /// This may be a HF hub repo or a local path. #[arg(short = 'm', long)] @@ -156,14 +152,12 @@ pub enum ModelSelected { /// Select a GGUF model with LoRA. LoraGGUF { - /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path. + /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file. + /// If the `chat_template` is specified, then it will be treated as a path and used over remote files, + /// removing all remote accesses. #[arg(short, long)] tok_model_id: Option<String>, - /// Path to local tokenizer.json file. If this is specified it is used over any remote file. - #[arg(long)] - tokenizer_json: Option<String>, - /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set. /// This may be a HF hub repo or a local path. #[arg(short = 'm', long)] diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs index 6e104836c1..ae3bb9dcaa 100644 --- a/mistralrs-core/src/pipeline/gguf.rs +++ b/mistralrs-core/src/pipeline/gguf.rs @@ -14,7 +14,7 @@ use crate::prefix_cacher::PrefixCacheManager; use crate::sequence::Sequence; use crate::utils::varbuilder_utils::{from_mmaped_safetensors, load_preload_adapters}; use crate::xlora_models::NonGranularState; -use crate::{do_sample, get_mut_arcmutex, get_paths, DeviceMapMetadata, DEBUG}; +use crate::{do_sample, get_mut_arcmutex, get_paths_gguf, DeviceMapMetadata, DEBUG}; use crate::{ models::quantized_llama::ModelWeights as QLlama, models::quantized_phi2::ModelWeights as QPhi, @@ -69,7 +69,6 @@ pub struct GGUFLoader { xlora_order: Option<Ordering>, no_kv_cache: bool, chat_template: Option<String>, - tokenizer_json: Option<String>, kind: ModelKind, tgt_non_granular_index: Option<usize>, } @@ -119,24 +118,24 @@ pub struct GGUFLoaderBuilder { xlora_order: Option<Ordering>, no_kv_cache: bool, chat_template: Option<String>, - tokenizer_json: Option<String>, tgt_non_granular_index: Option<usize>, } impl GGUFLoaderBuilder { + /// Create a loader builder for a GGUF model. `tok_model_id` is the model ID where you can find a + /// `tokenizer_config.json` file. If the `chat_template` is specified, then it will be treated as a + /// path and used over remote files, removing all remote accesses. 
pub fn new( config: GGUFSpecificConfig, chat_template: Option<String>, - tokenizer_json: Option<String>, - model_id: Option<String>, + tok_model_id: Option<String>, quantized_model_id: String, quantized_filename: String, ) -> Self { Self { config, chat_template, - tokenizer_json, - model_id, + model_id: tok_model_id, kind: ModelKind::QuantizedGGUF, quantized_filename, quantized_model_id, @@ -197,7 +196,6 @@ impl GGUFLoaderBuilder { xlora_order: self.xlora_order, no_kv_cache: self.no_kv_cache, chat_template: self.chat_template, - tokenizer_json: self.tokenizer_json, tgt_non_granular_index: self.tgt_non_granular_index, quantized_filename: Some(self.quantized_filename), quantized_model_id: Some(self.quantized_model_id), @@ -217,7 +215,6 @@ impl GGUFLoader { xlora_order: Option<Ordering>, no_kv_cache: bool, chat_template: Option<String>, - tokenizer_json: Option<String>, tgt_non_granular_index: Option<usize>, ) -> Self { let model_id = if let Some(id) = model_id { @@ -238,7 +235,6 @@ impl GGUFLoader { xlora_order, no_kv_cache, chat_template, - tokenizer_json, kind, tgt_non_granular_index, } @@ -279,7 +275,7 @@ impl Loader for GGUFLoader { mapper: DeviceMapMetadata, in_situ_quant: Option<GgmlDType>, ) -> Result<Arc<Mutex<dyn Pipeline + Send + Sync>>> { - let paths: anyhow::Result<Box<dyn ModelPaths>> = get_paths!( + let paths: anyhow::Result<Box<dyn ModelPaths>> = get_paths_gguf!( LocalModelPaths, &token_source, revision, diff --git a/mistralrs-core/src/pipeline/macros.rs b/mistralrs-core/src/pipeline/macros.rs index 25068ccad1..7f8f663d59 100644 --- a/mistralrs-core/src/pipeline/macros.rs +++ b/mistralrs-core/src/pipeline/macros.rs @@ -138,6 +138,89 @@ macro_rules! get_paths { }}; } +#[macro_export] +macro_rules! get_paths_gguf { + ($path_name:ident, $token_source:expr, $revision:expr, $this:expr, $quantized_model_id:expr, $quantized_filename:expr, $silent:expr) => {{ + let api = ApiBuilder::new() + .with_progress(!$silent) + .with_token(get_token($token_source)?) + .build()?; + let revision = $revision.unwrap_or("main".to_string()); + let api = api.repo(Repo::with_revision( + $this.model_id.clone(), + RepoType::Model, + revision.clone(), + )); + let model_id = std::path::Path::new(&$this.model_id); + + let chat_template = if let Some(ref p) = $this.chat_template { + if p.ends_with(".json") { + info!("Using chat template file at `{p}`"); + PathBuf::from_str(p)? + } else { + PathBuf::from_str("")? 
+ } + } else { + $crate::api_get_file!( + api, + "tokenizer_config.json", + model_id + ) // Will be loaded from inside gguf file + }; + + let filenames = get_model_paths( + revision.clone(), + &$token_source, + &$quantized_model_id, + &$quantized_filename, + &api, + &model_id, + )?; + + let XLoraPaths { + adapter_configs, + adapter_safetensors, + classifier_path, + xlora_order, + xlora_config, + lora_preload_adapter_info, + } = get_xlora_paths( + $this.model_id.clone(), + &$this.xlora_model_id, + &$token_source, + revision.clone(), + &$this.xlora_order, + )?; + + let gen_conf = if $crate::api_dir_list!(api, model_id) + .collect::<Vec<_>>() + .contains(&"generation_config.json".to_string()) + { + Some($crate::api_get_file!( + api, + "generation_config.json", + model_id + )) + } else { + None + }; + + Ok(Box::new($path_name { + tokenizer_filename: PathBuf::from_str("")?, + config_filename: PathBuf::from_str("")?, + filenames, + xlora_adapter_configs: adapter_configs, + xlora_adapter_filenames: adapter_safetensors, + classifier_path, + classifier_config: xlora_config, + xlora_ordering: xlora_order, + template_filename: chat_template, + gen_conf, + lora_preload_adapter_info, + })) + }}; +} + #[macro_export] macro_rules! normal_model_loader { ($paths:expr, $dtype:expr, $default_dtype:expr, $device:expr, $config:expr, $loader:expr, $use_flash_attn:expr, $silent:expr, $mapper:expr, $loading_isq:expr, $real_device:expr) => {{ diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index 68b94bb089..d06b91af77 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -1298,8 +1298,20 @@ pub(crate) fn get_chat_template( paths: &Box<dyn ModelPaths>, chat_template: &Option<String>, ) -> ChatTemplate { + let template_filename = if paths.get_template_filename().to_string_lossy().is_empty() { + PathBuf::from( + chat_template + .as_ref() + .expect("A tokenizer config or chat template file path must be specified."), + ) + } else { + paths.get_template_filename().clone() + }; + if !template_filename.ends_with(".json") { + panic!("Template filename {template_filename:?} must end with `.json`."); + } let template: ChatTemplate = - serde_json::from_str(&fs::read_to_string(paths.get_template_filename()).unwrap()).unwrap(); + serde_json::from_str(&fs::read_to_string(&template_filename).unwrap()).unwrap(); #[derive(Debug, serde::Deserialize)] struct SpecifiedTemplate { @@ -1314,7 +1326,7 @@ pub(crate) fn get_chat_template( info!("`tokenizer_config.json` does not contain a chat template, attempting to use specified JINJA chat template."); let mut deser: HashMap<String, Value> = - serde_json::from_str(&fs::read_to_string(paths.get_template_filename()).unwrap()).unwrap(); + serde_json::from_str(&fs::read_to_string(&template_filename).unwrap()).unwrap(); match chat_template.clone() { Some(t) => { diff --git a/mistralrs-core/src/toml_selector.rs b/mistralrs-core/src/toml_selector.rs index 5bf67276ca..478d940eb0 100644 --- a/mistralrs-core/src/toml_selector.rs +++ b/mistralrs-core/src/toml_selector.rs @@ -65,7 +65,9 @@ enum TomlModelSelected { /// Select a GGUF model. #[allow(clippy::upper_case_acronyms)] GGUF { - /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path. + /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file. + /// If the `chat_template` is specified, then it will be treated as a path and used over remote files, + /// removing all remote accesses. 
tok_model_id: String, /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set. @@ -78,7 +80,9 @@ enum TomlModelSelected { /// Select a GGUF model with X-LoRA. XLoraGGUF { - /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path. + /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file. + /// If the `chat_template` is specified, then it will be treated as a path and used over remote files, + /// removing all remote accesses. tok_model_id: Option<String>, /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set. @@ -101,7 +105,9 @@ enum TomlModelSelected { /// Select a GGUF model with LoRA. LoraGGUF { - /// Model ID to load the tokenizer from. This may be a HF hub repo or a local path. + /// `tok_model_id` is the local or remote model ID where you can find a `tokenizer_config.json` file. + /// If the `chat_template` is specified, then it will be treated as a path and used over remote files, + /// removing all remote accesses. tok_model_id: Option<String>, /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set. @@ -299,7 +305,6 @@ fn loader_from_selected( repeat_last_n: args.repeat_last_n, }, args.chat_template, - args.tokenizer_json, Some(tok_model_id), quantized_model_id, quantized_filename, @@ -317,7 +322,6 @@ fn loader_from_selected( repeat_last_n: args.repeat_last_n, }, args.chat_template, - args.tokenizer_json, tok_model_id, quantized_model_id, quantized_filename, @@ -343,7 +347,6 @@ fn loader_from_selected( repeat_last_n: args.repeat_last_n, }, args.chat_template, - args.tokenizer_json, tok_model_id, quantized_model_id, quantized_filename, diff --git a/mistralrs-pyo3/API.md b/mistralrs-pyo3/API.md index 7d03873482..359ac00e80 100644 --- a/mistralrs-pyo3/API.md +++ b/mistralrs-pyo3/API.md @@ -22,11 +22,13 @@ Additionally, for models without quantization, the model architecture should be ```py class Which(Enum): + @dataclass class Plain: model_id: str arch: Architecture tokenizer_json: str | None = None repeat_last_n: int = 64 + @dataclass class XLora: arch: Architecture xlora_model_id: str @@ -35,6 +37,7 @@ class Which(Enum): model_id: str | None = None tokenizer_json: str | None = None repeat_last_n: int = 64 + @dataclass class Lora: arch: Architecture adapters_model_id: str @@ -42,12 +45,13 @@ class Which(Enum): model_id: str | None = None tokenizer_json: str | None = None repeat_last_n: int = 64 + @dataclass class GGUF: tok_model_id: str quantized_model_id: str quantized_filename: str - tokenizer_json: str | None = None repeat_last_n: int = 64 + @dataclass class XLoraGGUF: tok_model_id: str quantized_model_id: str @@ -55,22 +59,23 @@ class Which(Enum): xlora_model_id: str order: str tgt_non_granular_index: int | None = None - tokenizer_json: str | None = None repeat_last_n: int = 64 + @dataclass class LoraGGUF: tok_model_id: str quantized_model_id: str quantized_filename: str adapters_model_id: str order: str - tokenizer_json: str | None = None repeat_last_n: int = 64 + @dataclass class GGML: tok_model_id: str quantized_model_id: str quantized_filename: str tokenizer_json: str | None = None repeat_last_n: int = 64 + @dataclass class XLoraGGML: tok_model_id: str quantized_model_id: str @@ -80,6 +85,7 @@ class Which(Enum): tgt_non_granular_index: int | None = None tokenizer_json: str | None = None repeat_last_n: int = 64 + @dataclass class LoraGGML: tok_model_id: str quantized_model_id: str diff 
--git a/mistralrs-pyo3/mistralrs.pyi b/mistralrs-pyo3/mistralrs.pyi index a1239a557d..f1d7c46c7c 100644 --- a/mistralrs-pyo3/mistralrs.pyi +++ b/mistralrs-pyo3/mistralrs.pyi @@ -96,7 +96,6 @@ class Which(Enum): tok_model_id: str quantized_model_id: str quantized_filename: str - tokenizer_json: str | None = None repeat_last_n: int = 64 @dataclass class XLoraGGUF: @@ -106,7 +105,6 @@ class Which(Enum): xlora_model_id: str order: str tgt_non_granular_index: int | None = None - tokenizer_json: str | None = None repeat_last_n: int = 64 @dataclass class LoraGGUF: @@ -115,7 +113,6 @@ class Which(Enum): quantized_filename: str adapters_model_id: str order: str - tokenizer_json: str | None = None repeat_last_n: int = 64 @dataclass class GGML: diff --git a/mistralrs-pyo3/src/lib.rs b/mistralrs-pyo3/src/lib.rs index b1e5f8a832..aa61d5d7fe 100644 --- a/mistralrs-pyo3/src/lib.rs +++ b/mistralrs-pyo3/src/lib.rs @@ -167,7 +167,6 @@ fn parse_which( .build(arch.into()), Which::GGUF { tok_model_id, - tokenizer_json, quantized_model_id, quantized_filename, repeat_last_n, @@ -176,7 +175,6 @@ fn parse_which( repeat_last_n: repeat_last_n.unwrap_or(REPEAT_LAST_N_DEFAULT), }, chat_template, - tokenizer_json, Some(tok_model_id), quantized_model_id, quantized_filename, @@ -184,7 +182,6 @@ fn parse_which( .build(), Which::XLoraGGUF { tok_model_id, - tokenizer_json, quantized_model_id, quantized_filename, repeat_last_n, @@ -196,7 +193,6 @@ fn parse_which( repeat_last_n: repeat_last_n.unwrap_or(REPEAT_LAST_N_DEFAULT), }, chat_template, - tokenizer_json, tok_model_id, quantized_model_id, quantized_filename, @@ -214,7 +210,6 @@ fn parse_which( .build(), Which::LoraGGUF { tok_model_id, - tokenizer_json, quantized_model_id, quantized_filename, repeat_last_n, @@ -225,7 +220,6 @@ fn parse_which( repeat_last_n: repeat_last_n.unwrap_or(REPEAT_LAST_N_DEFAULT), }, chat_template, - tokenizer_json, tok_model_id, quantized_model_id, quantized_filename, diff --git a/mistralrs-pyo3/src/which.rs b/mistralrs-pyo3/src/which.rs index f7def2cfb9..98bce20d87 100644 --- a/mistralrs-pyo3/src/which.rs +++ b/mistralrs-pyo3/src/which.rs @@ -57,7 +57,6 @@ pub enum Which { #[allow(clippy::upper_case_acronyms)] GGUF { tok_model_id: String, - tokenizer_json: Option<String>, quantized_model_id: String, quantized_filename: String, repeat_last_n: Option<usize>, @@ -65,7 +64,6 @@ pub enum Which { XLoraGGUF { tok_model_id: Option<String>, - tokenizer_json: Option<String>, quantized_model_id: String, quantized_filename: String, repeat_last_n: Option<usize>, @@ -76,7 +74,6 @@ pub enum Which { LoraGGUF { tok_model_id: Option<String>, - tokenizer_json: Option<String>, quantized_model_id: String, quantized_filename: String, repeat_last_n: Option<usize>, diff --git a/mistralrs/examples/quantized/main.rs b/mistralrs/examples/quantized/main.rs index 37f60ef01d..58f1ac92b4 100644 --- a/mistralrs/examples/quantized/main.rs +++ b/mistralrs/examples/quantized/main.rs @@ -12,7 +12,6 @@ fn setup() -> anyhow::Result<Arc<MistralRs>> { let loader = GGUFLoaderBuilder::new( GGUFSpecificConfig { repeat_last_n: 64 }, None, - None, Some("mistralai/Mistral-7B-Instruct-v0.1".to_string()), "TheBloke/Mistral-7B-Instruct-v0.1-GGUF".to_string(), "mistral-7b-instruct-v0.1.Q4_K_M.gguf".to_string(), From d8831239b62d93597e234a2ef9ab2e9c6ebd5ab0 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 08:38:28 -0400 Subject: [PATCH 13/23] Update docs for loading --- README.md | 10 ++++++---- mistralrs-core/src/model_loader.rs | 2 +- 
mistralrs-core/src/model_selected.rs | 2 +- mistralrs-pyo3/src/lib.rs | 2 +- mistralrs-pyo3/src/which.rs | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 256e5d369c..6bc804e6b0 100644 --- a/README.md +++ b/README.md @@ -240,16 +240,18 @@ This is passed in the following ways: If token cannot be loaded, no token will be used (i.e. effectively using `none`). -## Loading models from local files:** +## Loading models from local files: -You can also instruct mistral.rs to load models locally by modifying the `*_model_id` arguments or options: +You can also instruct mistral.rs to load models fully locally by modifying the `*_model_id` arguments or options: ```bash ./mistralrs_server --port 1234 plain -m . -a mistral ``` -or + +To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument and instead should pass a path to the +chat template JSON file (examples [here](chat_templates)) as well as specifying a local model ID. For example: ```bash -./mistralrs-server gguf -m . -t . -f Phi-3-mini-128k-instruct-q4_K_M.gguf +./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf ``` Throughout mistral.rs, any model ID argument or option may be a local path and should contain the following files for each model ID option: diff --git a/mistralrs-core/src/model_loader.rs b/mistralrs-core/src/model_loader.rs index 3ab381ad97..3d61eb62cc 100644 --- a/mistralrs-core/src/model_loader.rs +++ b/mistralrs-core/src/model_loader.rs @@ -156,7 +156,7 @@ fn loader_from_model_selected(args: LoaderBuilder) -> anyhow::Result<Box<dyn Loa } => GGUFLoaderBuilder::new( GGUFSpecificConfig { repeat_last_n }, args.chat_template, - Some(tok_model_id), + tok_model_id, quantized_model_id, quantized_filename, ) diff --git a/mistralrs-core/src/model_selected.rs b/mistralrs-core/src/model_selected.rs index 1bf68939d6..a9ed08e5d0 100644 --- a/mistralrs-core/src/model_selected.rs +++ b/mistralrs-core/src/model_selected.rs @@ -99,7 +99,7 @@ pub enum ModelSelected { /// If the `chat_template` is specified, then it will be treated as a path and used over remote files, /// removing all remote accesses. #[arg(short, long)] - tok_model_id: String, + tok_model_id: Option<String>, /// Quantized model ID to find the `quantized_filename`, only applicable if `quantized` is set. /// This may be a HF hub repo or a local path. 
diff --git a/mistralrs-pyo3/src/lib.rs b/mistralrs-pyo3/src/lib.rs index aa61d5d7fe..ae0ec9d3b6 100644 --- a/mistralrs-pyo3/src/lib.rs +++ b/mistralrs-pyo3/src/lib.rs @@ -175,7 +175,7 @@ fn parse_which( repeat_last_n: repeat_last_n.unwrap_or(REPEAT_LAST_N_DEFAULT), }, chat_template, - Some(tok_model_id), + tok_model_id, quantized_model_id, quantized_filename, ) diff --git a/mistralrs-pyo3/src/which.rs b/mistralrs-pyo3/src/which.rs index 98bce20d87..a5a33a6123 100644 --- a/mistralrs-pyo3/src/which.rs +++ b/mistralrs-pyo3/src/which.rs @@ -56,7 +56,7 @@ pub enum Which { #[allow(clippy::upper_case_acronyms)] GGUF { - tok_model_id: String, + tok_model_id: Option<String>, quantized_model_id: String, quantized_filename: String, repeat_last_n: Option<usize>, From bf308d476e13e3ca5f80e9dcf0af38f7c0cca905 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 12:46:30 -0400 Subject: [PATCH 14/23] Fix extension checking --- mistralrs-core/src/pipeline/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index d06b91af77..9d7dbee835 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -1307,7 +1307,12 @@ pub(crate) fn get_chat_template( } else { paths.get_template_filename().clone() }; - if !template_filename.ends_with(".json") { + if template_filename + .extension() + .expect("Template filename must be a file") + .to_string_lossy() + != "json" + { panic!("Template filename {template_filename:?} must end with `.json`."); } let template: ChatTemplate = From ec4ccb9ac31e8240e301aaf0321566b80555425f Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 14:07:46 -0400 Subject: [PATCH 15/23] Add some tests --- mistralrs-core/Cargo.toml | 1 + mistralrs-core/src/pipeline/gguf_tokenizer.rs | 165 +++++++++++++++++- mistralrs-core/src/sampler.rs | 6 +- 3 files changed, 159 insertions(+), 13 deletions(-) diff --git a/mistralrs-core/Cargo.toml b/mistralrs-core/Cargo.toml index 2ba0475874..9d4c9c1999 100644 --- a/mistralrs-core/Cargo.toml +++ b/mistralrs-core/Cargo.toml @@ -56,6 +56,7 @@ toml = "0.8.12" strum = { version = "0.26", features = ["derive"] } derive_more = { version = "0.99.17", default-features = false, features = ["from"] } tracing-subscriber.workspace = true +reqwest = { version = "0.12.4", features = ["blocking"] } [features] pyo3_macros = ["pyo3"] diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index b693bb1544..3bb97fa9bf 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -3,10 +3,10 @@ use std::sync::atomic::Ordering; use anyhow::Result; use candle_core::quantized::gguf_file::Content; use tokenizers::{ - decoders::{byte_fallback::ByteFallback, sequence::Sequence, strip::Strip}, + decoders::{self, byte_fallback::ByteFallback, fuse::Fuse, strip::Strip}, models::unigram::Unigram, - normalizers::Replace, - DecoderWrapper, ModelWrapper, Tokenizer, + normalizers::{self, Prepend, Replace}, + AddedToken, DecoderWrapper, ModelWrapper, NormalizerWrapper, Tokenizer, }; use tracing::info; @@ -59,11 +59,11 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { .to_u32() .expect("GGUF unk token is not u32"); - let _eos = content.metadata["tokenizer.ggml.eos_token_id"] + let eos = content.metadata["tokenizer.ggml.eos_token_id"] .to_u32() .expect("GGUF unk 
token is not u32"); - let _bos = content.metadata["tokenizer.ggml.bos_token_id"] + let bos = content.metadata["tokenizer.ggml.bos_token_id"] .to_u32() .expect("GGUF unk token is not u32"); @@ -74,17 +74,27 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { .as_ref() .expect("Expect `tokenizer.ggml.scores` for `llama` unigram tokeizer."); let mut vocab = Vec::new(); - for (token, score) in tokens.into_iter().zip(scores) { - vocab.push((token, *score as f64)); + for (token, score) in tokens.iter().zip(scores) { + vocab.push((token.clone(), *score as f64)); } let unigram = Unigram::from(vocab, Some(unk as usize), true).map_err(anyhow::Error::msg)?; let mut tokenizer = Tokenizer::new(ModelWrapper::Unigram(unigram)); - tokenizer.with_decoder(Sequence::new(vec![ + tokenizer.with_decoder(decoders::sequence::Sequence::new(vec![ DecoderWrapper::Replace(Replace::new("▁", " ").map_err(anyhow::Error::msg)?), DecoderWrapper::ByteFallback(ByteFallback::new()), + DecoderWrapper::Fuse(Fuse::new()), DecoderWrapper::Strip(Strip::new(' ', 1, 0)), ])); + tokenizer.with_normalizer(normalizers::Sequence::new(vec![ + NormalizerWrapper::Prepend(Prepend::new("▁".to_string())), + NormalizerWrapper::Replace(Replace::new(" ", "▁").map_err(anyhow::Error::msg)?), + ])); + + tokenizer.add_special_tokens(&[AddedToken::from(tokens[bos as usize].clone(), true)]); + tokenizer.add_special_tokens(&[AddedToken::from(tokens[eos as usize].clone(), true)]); + tokenizer.add_special_tokens(&[AddedToken::from(tokens[unk as usize].clone(), true)]); + (tokenizer, "unigram") } other => { @@ -104,3 +114,142 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { } Ok(tokenizer) } + +mod tests { + use anyhow::Result; + use candle_core::quantized::gguf_file::Content; + use hf_hub::{api::sync::ApiBuilder, Repo, RepoType}; + use tokenizers::Tokenizer; + + use super::convert_ggml_to_hf_tokenizer; + + #[allow(dead_code)] + #[derive(Debug)] + enum TokenizerType { + /// Mistral v0.1 tokenizer + Llama, + Replit, + Gpt2, + Rwkv, + } + + #[allow(dead_code)] + fn get_gguf_tokenizer(tokenizer: TokenizerType) -> Result<Tokenizer> { + match tokenizer { + TokenizerType::Llama => { + let api = ApiBuilder::new().with_progress(true).build().unwrap(); + let api = api.repo(Repo::with_revision( + "TheBloke/Mistral-7B-Instruct-v0.1-GGUF".to_string(), + RepoType::Model, + "main".to_string(), + )); + + let filename = api.get("mistral-7b-instruct-v0.1.Q2_K.gguf").unwrap(); + let mut file = std::fs::File::open(&filename)?; + convert_ggml_to_hf_tokenizer( + &Content::read(&mut file) + .map_err(|e| e.with_path(filename)) + .map_err(anyhow::Error::msg)?, + ) + .map_err(anyhow::Error::msg) + } + other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"), + } + } + + #[allow(dead_code)] + fn get_hf_tokenizer(tokenizer: TokenizerType) -> Result<Tokenizer> { + match tokenizer { + TokenizerType::Llama => { + let api = ApiBuilder::new().with_progress(true).build().unwrap(); + let api = api.repo(Repo::with_revision( + "EricB/mistralrs_tests".to_string(), + RepoType::Model, + "main".to_string(), + )); + + let tokenizer_filename = api.get("tokenizer.json").unwrap(); + Ok(Tokenizer::from_file(tokenizer_filename).unwrap()) + } + other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"), + } + } + + #[allow(dead_code)] + fn get_test_passage() -> String { + let passage = reqwest::blocking::get("https://loripsum.net/api") + .expect("Failed to download sample text") + .bytes() + 
.expect("Failed to get bytes"); + String::from_utf8(passage.to_vec()).expect("Failed to convert sample text to string.") + } + + #[test] + fn test_encode_llama() -> Result<()> { + let passage = get_test_passage(); + let hf_tokenizer = get_hf_tokenizer(TokenizerType::Llama)?; + let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Llama)?; + + // Without special tokens + let hf_tokenized = hf_tokenizer + .encode(passage.as_str(), false) + .map_err(anyhow::Error::msg)?; + let gguf_tokenized = gguf_tokenizer + .encode(passage.as_str(), false) + .map_err(anyhow::Error::msg)?; + let hf_decoded = hf_tokenizer + .decode(hf_tokenized.get_ids(), false) + .map_err(anyhow::Error::msg)?; + let gguf_decoded = gguf_tokenizer + .decode(gguf_tokenized.get_ids(), false) + .map_err(anyhow::Error::msg)?; + assert_eq!(hf_decoded, gguf_decoded); + + // With special tokens + let hf_tokenized = hf_tokenizer + .encode(passage.as_str(), true) + .map_err(anyhow::Error::msg)?; + let gguf_tokenized = gguf_tokenizer + .encode(passage.as_str(), true) + .map_err(anyhow::Error::msg)?; + let hf_decoded = hf_tokenizer + .decode(hf_tokenized.get_ids(), true) + .map_err(anyhow::Error::msg)?; + let gguf_decoded = gguf_tokenizer + .decode(gguf_tokenized.get_ids(), true) + .map_err(anyhow::Error::msg)?; + assert_eq!(hf_decoded, gguf_decoded); + Ok(()) + } + + #[test] + fn test_decode() -> Result<()> { + use rand::seq::SliceRandom; + use rand::thread_rng; + + let hf_tokenizer = get_hf_tokenizer(TokenizerType::Llama)?; + let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Llama)?; + + let mut tokens = (0..hf_tokenizer.get_vocab_size(false) as u32).collect::<Vec<_>>(); + tokens.shuffle(&mut thread_rng()); + + // Without skipping special tokens + let hf_decoded = hf_tokenizer + .decode(&tokens, false) + .map_err(anyhow::Error::msg)?; + let gguf_decoded = gguf_tokenizer + .decode(&tokens, false) + .map_err(anyhow::Error::msg)?; + assert_eq!(hf_decoded, gguf_decoded); + + // With skipping special tokens + let hf_decoded = hf_tokenizer + .decode(&tokens, true) + .map_err(anyhow::Error::msg)?; + let gguf_decoded = gguf_tokenizer + .decode(&tokens, true) + .map_err(anyhow::Error::msg)?; + assert_eq!(hf_decoded, gguf_decoded); + Ok(()) + } +} diff --git a/mistralrs-core/src/sampler.rs b/mistralrs-core/src/sampler.rs index 520b139f0c..a8da56c100 100644 --- a/mistralrs-core/src/sampler.rs +++ b/mistralrs-core/src/sampler.rs @@ -413,11 +413,7 @@ mod tests { #[allow(dead_code)] fn get_tokenizer() -> Tokenizer { - let api = ApiBuilder::new() - .with_progress(true) - .with_token(Some(std::env::var("TESTS_HF_TOKEN").unwrap())) - .build() - .unwrap(); + let api = ApiBuilder::new().with_progress(true).build().unwrap(); let api = api.repo(Repo::with_revision( "EricB/mistralrs_tests".to_string(), RepoType::Model, From e0551d3fdca1704fbdf6372565666af3e06b1427 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 14:21:06 -0400 Subject: [PATCH 16/23] Update test --- mistralrs-core/src/pipeline/gguf_tokenizer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 3bb97fa9bf..5d6e644985 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -230,6 +230,7 @@ mod tests { let hf_tokenizer = get_hf_tokenizer(TokenizerType::Llama)?; let gguf_tokenizer = get_gguf_tokenizer(TokenizerType::Llama)?; + #[allow(clippy::cast_possible_truncation)] let mut tokens = 
(0..hf_tokenizer.get_vocab_size(false) as u32).collect::<Vec<_>>(); tokens.shuffle(&mut thread_rng()); From 6c832d15dc70202736b88e060108080f79bce972 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 14:27:25 -0400 Subject: [PATCH 17/23] Update docs --- README.md | 22 +++++++++++++------ mistralrs-core/src/pipeline/gguf_tokenizer.rs | 2 +- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6bc804e6b0..55edf13427 100644 --- a/README.md +++ b/README.md @@ -247,13 +247,6 @@ You can also instruct mistral.rs to load models fully locally by modifying the ` ./mistralrs_server --port 1234 plain -m . -a mistral ``` -To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument and instead should pass a path to the -chat template JSON file (examples [here](chat_templates)) as well as specifying a local model ID. For example: - -```bash -./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf -``` - Throughout mistral.rs, any model ID argument or option may be a local path and should contain the following files for each model ID option: - `--model-id` (server) or `model_id` (python/rust) or `--tok-model-id` (server) or `tok_model_id` (python/rust): - `config.json` @@ -269,6 +262,21 @@ Throughout mistral.rs, any model ID argument or option may be a local path and s - `--adapters-model-id` (server) or `adapters_model_id` (python/rust): - Adapters `.safetensors` and `adapter_config.json` files in their respective directories +## Running GGUF models locally + +To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument and instead should pass a path to the +chat template JSON file (examples [here](chat_templates)) as well as specifying a local model ID. For example: + +```bash +./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf +``` + +The following tokenizer model types are currently supported. If you would like one to be added, please raise an issue. Otherwise, +please consider using the method demonstrated in examples below, where the tokenizer is sourced from Hugging Face. + +**Supported GGUF tokenizer types** +- `llama` + ### Run To start a server serving Mistral GGUF on `localhost:1234`, diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 5d6e644985..1a8333616d 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -223,7 +223,7 @@ mod tests { } #[test] - fn test_decode() -> Result<()> { + fn test_decode_llama() -> Result<()> { use rand::seq::SliceRandom; use rand::thread_rng; From 30055fff8aacbc402284482b391b995ce04c61e0 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 14:38:52 -0400 Subject: [PATCH 18/23] Update readme --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 55edf13427..a75eb5279e 100644 --- a/README.md +++ b/README.md @@ -155,7 +155,7 @@ Please submit more benchmarks via raising an issue! ## Usage ### Installation and Build -To install mistral.rs, one should ensure they have Rust installed by following [this](https://rustup.rs/) link. Additionally, the Hugging Face token should be provided in `~/.cache/huggingface/token` when using the server to enable automatic download of gated models. 
+To install mistral.rs, one should ensure they have Rust installed by following [this](https://rustup.rs/) link. Additionally, the Hugging Face token should be provided in `~/.cache/huggingface/token` by running `huggingface-cli login` to enable automatic download of gated models. 1) Install required packages - `openssl` (ex., `sudo apt install libssl-dev`) @@ -169,9 +169,7 @@ To install mistral.rs, one should ensure they have Rust installed by following [ 3) Set HF token correctly (skip if already set or your model is not gated, or if you want to use the `token_source` parameters in Python or the command line.) ```bash - mkdir ~/.cache/huggingface - touch ~/.cache/huggingface/token - echo <HF_TOKEN_HERE> > ~/.cache/huggingface/token + huggingface-cli login ``` 4) Download the code @@ -220,6 +218,7 @@ To install mistral.rs, one should ensure they have Rust installed by following [ You can install Python support by following the guide [here](mistralrs-pyo3/README.md). +## Getting models ### Getting models from HF Hub Mistral.rs can automatically download models from HF Hub. To access gated models, you should provide a token source. They may be one of: @@ -240,7 +239,7 @@ This is passed in the following ways: If token cannot be loaded, no token will be used (i.e. effectively using `none`). -## Loading models from local files: +### Loading models from local files: You can also instruct mistral.rs to load models fully locally by modifying the `*_model_id` arguments or options: ```bash @@ -262,10 +261,10 @@ Throughout mistral.rs, any model ID argument or option may be a local path and s - `--adapters-model-id` (server) or `adapters_model_id` (python/rust): - Adapters `.safetensors` and `adapter_config.json` files in their respective directories -## Running GGUF models locally +### Running GGUF models locally To run GGUF models fully locally, you do not need to specify the tokenizer model ID argument and instead should pass a path to the -chat template JSON file (examples [here](chat_templates)) as well as specifying a local model ID. For example: +chat template JSON file (examples [here](chat_templates), you will need to create your own by specifying the chat template and `bos`/`eos` tokens) as well as specifying a local model ID. For example: ```bash ./mistralrs-server --chat-template <chat_template> gguf -m . -f Phi-3-mini-128k-instruct-q4_K_M.gguf @@ -277,7 +276,7 @@ please consider using the method demonstrated in examples below, where the token **Supported GGUF tokenizer types** - `llama` -### Run +## Run To start a server serving Mistral GGUF on `localhost:1234`, ```bash From c374297fa6f087d0ef3b4cee94c3a95504843fa6 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 14:41:48 -0400 Subject: [PATCH 19/23] Update readme --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a75eb5279e..80c2b16c5b 100644 --- a/README.md +++ b/README.md @@ -219,7 +219,12 @@ To install mistral.rs, one should ensure they have Rust installed by following [ You can install Python support by following the guide [here](mistralrs-pyo3/README.md). ## Getting models -### Getting models from HF Hub + +There are 2 ways to run a model with mistral.rs: +- From Hugging Face Hub (easiest) +- From local files + +### Getting models from Hugging Face Hub Mistral.rs can automatically download models from HF Hub. To access gated models, you should provide a token source. 
They may be one of: - `literal:<value>`: Load from a specified literal @@ -299,7 +304,7 @@ Additionally, for models without quantization, the model architecture should be You can launch interactive mode, a simple chat application running in the terminal, by passing `-i`: ```bash -./mistralrs_server -i gguf -t mistralai/Mistral-7B-Instruct-v0.1 -m TheBloke/Mistral-7B-Instruct-v0.1-GGUF -f mistral-7b-instruct-v0.1.Q4_K_M.gguf +./mistralrs_server -i plain -m microsoft/Phi-3-mini-128k-instruct -a phi3 ``` ### Quick examples: @@ -342,7 +347,7 @@ To start a server running Llama from GGML: To start a server running Mistral from safetensors. ```bash -./mistralrs_server --port 1234 gguf -m mistralai/Mistral-7B-Instruct-v0.1 +./mistralrs_server --port 1234 plain -m mistralai/Mistral-7B-Instruct-v0.1 -a mistral ``` ### Structured selection with a `.toml` file From 71bdd2f1bff086e0fa850caf4fcc0278b528ba29 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 15:24:25 -0400 Subject: [PATCH 20/23] Bump version --- Cargo.toml | 2 +- mistralrs-bench/Cargo.toml | 2 +- mistralrs-pyo3/Cargo.toml | 2 +- mistralrs-pyo3/Cargo_template.toml | 2 +- mistralrs-pyo3/pyproject.toml | 2 +- mistralrs-pyo3/pyproject_template.toml | 2 +- mistralrs-server/Cargo.toml | 2 +- mistralrs/Cargo.toml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 297b55b114..f9583c7f55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.1.10" +version = "0.1.11" edition = "2021" description = "Fast and easy LLM serving." homepage = "https://github.com/EricLBuehler/mistral.rs" diff --git a/mistralrs-bench/Cargo.toml b/mistralrs-bench/Cargo.toml index 875f7b78b1..8f53baff22 100644 --- a/mistralrs-bench/Cargo.toml +++ b/mistralrs-bench/Cargo.toml @@ -17,7 +17,7 @@ candle-core.workspace = true serde.workspace = true serde_json.workspace = true clap.workspace = true -mistralrs-core = { version = "0.1.10", path = "../mistralrs-core" } +mistralrs-core = { version = "0.1.11", path = "../mistralrs-core" } tracing.workspace = true either.workspace = true tokio.workspace = true diff --git a/mistralrs-pyo3/Cargo.toml b/mistralrs-pyo3/Cargo.toml index cf32d63295..e9d2945442 100644 --- a/mistralrs-pyo3/Cargo.toml +++ b/mistralrs-pyo3/Cargo.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.1.10", path = "../mistralrs-core", features = ["pyo3_macros"] } +mistralrs-core = { version = "0.1.11", path = "../mistralrs-core", features = ["pyo3_macros"] } serde.workspace = true serde_json.workspace = true candle-core.workspace = true diff --git a/mistralrs-pyo3/Cargo_template.toml b/mistralrs-pyo3/Cargo_template.toml index 6944b192fd..a3a52a0eb1 100644 --- a/mistralrs-pyo3/Cargo_template.toml +++ b/mistralrs-pyo3/Cargo_template.toml @@ -17,7 +17,7 @@ doc = false [dependencies] pyo3.workspace = true -mistralrs-core = { version = "0.1.10", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } +mistralrs-core = { version = "0.1.11", path = "../mistralrs-core", features=["pyo3_macros","$feature_name"] } serde.workspace = true serde_json.workspace = true candle-core = { git = "https://github.com/EricLBuehler/candle.git", version = "0.5.0", features=["$feature_name"] } diff --git a/mistralrs-pyo3/pyproject.toml b/mistralrs-pyo3/pyproject.toml index 3aa089e4c9..bf62a1349e 100644 --- a/mistralrs-pyo3/pyproject.toml +++ 
b/mistralrs-pyo3/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "mistralrs" -version = "0.1.10" +version = "0.1.11" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-pyo3/pyproject_template.toml b/mistralrs-pyo3/pyproject_template.toml index 01e9848235..b8afe9f18a 100644 --- a/mistralrs-pyo3/pyproject_template.toml +++ b/mistralrs-pyo3/pyproject_template.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "$name" -version = "0.1.10" +version = "0.1.11" requires-python = ">=3.8" classifiers = [ "Programming Language :: Rust", diff --git a/mistralrs-server/Cargo.toml b/mistralrs-server/Cargo.toml index a7fa730c3f..86fe006fd0 100644 --- a/mistralrs-server/Cargo.toml +++ b/mistralrs-server/Cargo.toml @@ -22,7 +22,7 @@ axum = { version = "0.7.4", features = ["tokio"] } tower-http = { version = "0.5.1", features = ["cors"]} utoipa = { version = "4.2", features = ["axum_extras"] } utoipa-swagger-ui = { version = "7.1.0", features = ["axum"]} -mistralrs-core = { version = "0.1.10", path = "../mistralrs-core" } +mistralrs-core = { version = "0.1.11", path = "../mistralrs-core" } dyn-fmt = "0.4.0" indexmap.workspace = true accelerate-src = { workspace = true, optional = true } diff --git a/mistralrs/Cargo.toml b/mistralrs/Cargo.toml index 1ceed5ec12..d105982bd9 100644 --- a/mistralrs/Cargo.toml +++ b/mistralrs/Cargo.toml @@ -12,7 +12,7 @@ license.workspace = true homepage.workspace = true [dependencies] -mistralrs-core = { version = "0.1.10", path = "../mistralrs-core" } +mistralrs-core = { version = "0.1.11", path = "../mistralrs-core" } anyhow.workspace = true tokio.workspace = true candle-core.workspace = true From cfe2fd3674c9853e8f3738d7705ed727586efa9d Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 20:10:38 -0400 Subject: [PATCH 21/23] Add examples readme --- examples/README.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 examples/README.md diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000000..043a2211d8 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,4 @@ +# Examples +- Python: [examples here](python) +- HTTP Server: [examples here](server) +- Rust: [examples here](../mistralrs/examples/) \ No newline at end of file From ddba24b2813cb2614f9f40adacd59e96caabf917 Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Tue, 28 May 2024 21:12:23 -0400 Subject: [PATCH 22/23] Add an example and fixes --- chat_templates/llama2.json | 3 + chat_templates/llama3.json | 3 + chat_templates/mistral.json | 3 + chat_templates/phi3.json | 3 + mistralrs-core/src/pipeline/chat_template.rs | 29 ++++---- mistralrs-core/src/pipeline/gguf.rs | 72 +++++++++++++------ mistralrs-core/src/pipeline/gguf_tokenizer.rs | 21 +++++- mistralrs-core/src/pipeline/macros.rs | 20 ++++-- mistralrs-core/src/pipeline/mod.rs | 12 ++-- mistralrs/Cargo.toml | 4 ++ mistralrs/examples/gguf_locally/main.rs | 64 +++++++++++++++++ mistralrs/examples/quantized/main.rs | 1 + 12 files changed, 188 insertions(+), 47 deletions(-) create mode 100644 chat_templates/llama2.json create mode 100644 chat_templates/llama3.json create mode 100644 chat_templates/mistral.json create mode 100644 chat_templates/phi3.json create mode 100644 mistralrs/examples/gguf_locally/main.rs diff --git a/chat_templates/llama2.json b/chat_templates/llama2.json new file mode 100644 index 0000000000..800a077f2c --- /dev/null +++ b/chat_templates/llama2.json @@ -0,0 +1,3 
@@ +{ + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}" +} \ No newline at end of file diff --git a/chat_templates/llama3.json b/chat_templates/llama3.json new file mode 100644 index 0000000000..61bafeb2ed --- /dev/null +++ b/chat_templates/llama3.json @@ -0,0 +1,3 @@ +{ + "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}" +} \ No newline at end of file diff --git a/chat_templates/mistral.json b/chat_templates/mistral.json new file mode 100644 index 0000000000..15544fda6b --- /dev/null +++ b/chat_templates/mistral.json @@ -0,0 +1,3 @@ +{ + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token + ' ' }}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" +} \ No newline at end of file diff --git a/chat_templates/phi3.json b/chat_templates/phi3.json new file mode 100644 index 0000000000..6d92f29e6e --- /dev/null +++ b/chat_templates/phi3.json @@ -0,0 +1,3 @@ +{ + "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" +} \ No newline at end of file diff --git a/mistralrs-core/src/pipeline/chat_template.rs b/mistralrs-core/src/pipeline/chat_template.rs index ee7dfa1155..e419b8901b 100644 --- a/mistralrs-core/src/pipeline/chat_template.rs +++ b/mistralrs-core/src/pipeline/chat_template.rs @@ -30,9 +30,9 @@ fn raise_exception(msg: String) -> Result<String, minijinja::Error> { } #[derive(Debug, Deserialize)] -pub struct Unk(#[serde(with = "either::serde_untagged")] pub Either<String, AddedTokensDecoder>); -#[derive(Debug, Deserialize)] -pub struct Bos(#[serde(with = "either::serde_untagged")] pub Either<String, AddedTokensDecoder>); +pub struct BeginEndUnkTok( + #[serde(with = "either::serde_untagged")] pub Either<String, AddedTokensDecoder>, +); #[allow(dead_code)] #[derive(Debug, Deserialize)] @@ -41,23 +41,22 @@ pub struct ChatTemplate { add_eos_token: 
Option<bool>, added_tokens_decoder: Option<HashMap<String, AddedTokensDecoder>>, additional_special_tokens: Option<Vec<String>>, - pub bos_token: Option<Bos>, + pub bos_token: Option<BeginEndUnkTok>, /// Jinja format chat templating for chat completion. /// See: https://huggingface.co/docs/transformers/chat_templating pub chat_template: Option<String>, clean_up_tokenization_spaces: Option<bool>, device_map: Option<String>, - #[serde(with = "either::serde_untagged")] - pub eos_token: Either<String, AddedTokensDecoder>, + pub eos_token: Option<BeginEndUnkTok>, legacy: Option<bool>, - model_max_length: f64, + model_max_length: Option<f64>, pad_token: Option<String>, sp_model_kwargs: Option<HashMap<String, String>>, spaces_between_special_tokens: Option<bool>, - tokenizer_class: String, + tokenizer_class: Option<String>, truncation_size: Option<String>, - pub unk_token: Option<Unk>, + pub unk_token: Option<BeginEndUnkTok>, use_default_system_prompt: Option<bool>, } @@ -66,10 +65,10 @@ impl ChatTemplate { self.chat_template.is_some() } - pub fn eos_tok(&self) -> String { - match self.eos_token { - Either::Left(ref lit) => lit.clone(), - Either::Right(ref added) => added.content.clone(), + pub fn eos_tok(&self) -> Option<String> { + match self.eos_token.as_ref()?.0 { + Either::Left(ref lit) => Some(lit.clone()), + Either::Right(ref added) => Some(added.content.clone()), } } @@ -93,7 +92,7 @@ pub fn calculate_eos_tokens( gen_conf: Option<GenerationConfig>, tokenizer: &Tokenizer, ) -> Vec<u32> { - let mut eos_tok_ids = vec![chat_template.eos_tok()]; + let mut eos_tok_ids = chat_template.eos_tok().map(|x| vec![x]).unwrap_or_default(); let mut bos_tok_ids = chat_template.bos_tok().map(|b| vec![b]).unwrap_or_default(); for alternate in SUPPORTED_ALTERNATE_EOS { @@ -173,7 +172,7 @@ pub fn apply_chat_template_to( add_generation_prompt: bool, template: &str, bos_tok: Option<String>, - eos_tok: &str, + eos_tok: Option<String>, unk_tok: Option<String>, ) -> Result<String> { let mut env = Environment::new(); diff --git a/mistralrs-core/src/pipeline/gguf.rs b/mistralrs-core/src/pipeline/gguf.rs index ae3bb9dcaa..71520b1d6d 100644 --- a/mistralrs-core/src/pipeline/gguf.rs +++ b/mistralrs-core/src/pipeline/gguf.rs @@ -6,12 +6,13 @@ use super::{ use crate::aici::bintokens::build_tok_trie; use crate::aici::toktree::TokTrie; use crate::lora::Ordering; -use crate::pipeline::chat_template::{calculate_eos_tokens, GenerationConfig}; -use crate::pipeline::gguf_tokenizer::convert_ggml_to_hf_tokenizer; +use crate::pipeline::chat_template::{calculate_eos_tokens, BeginEndUnkTok, GenerationConfig}; +use crate::pipeline::gguf_tokenizer::{convert_ggml_to_hf_tokenizer, ConversionResult}; use crate::pipeline::{get_chat_template, Cache}; use crate::pipeline::{ChatTemplate, LocalModelPaths}; use crate::prefix_cacher::PrefixCacheManager; use crate::sequence::Sequence; +use crate::utils::tokenizer::get_tokenizer; use crate::utils::varbuilder_utils::{from_mmaped_safetensors, load_preload_adapters}; use crate::xlora_models::NonGranularState; use crate::{do_sample, get_mut_arcmutex, get_paths_gguf, DeviceMapMetadata, DEBUG}; @@ -28,6 +29,7 @@ use candle_core::quantized::{ GgmlDType, }; use candle_core::{DType, Device, Tensor}; +use either::Either; use hf_hub::{api::sync::ApiBuilder, Repo, RepoType}; use rand_isaac::Isaac64Rng; use std::fs; @@ -61,10 +63,10 @@ pub struct GGUFPipeline { } pub struct GGUFLoader { - model_id: String, + model_id: Option<String>, config: GGUFSpecificConfig, - quantized_model_id: Option<String>, - 
quantized_filename: Option<String>, + quantized_model_id: String, + quantized_filename: String, xlora_model_id: Option<String>, xlora_order: Option<Ordering>, no_kv_cache: bool, @@ -189,7 +191,7 @@ impl GGUFLoaderBuilder { pub fn build(self) -> Box<dyn Loader> { Box::new(GGUFLoader { - model_id: self.model_id.unwrap(), + model_id: self.model_id, config: self.config, xlora_model_id: self.xlora_model_id, kind: self.kind, @@ -197,8 +199,8 @@ impl GGUFLoaderBuilder { no_kv_cache: self.no_kv_cache, chat_template: self.chat_template, tgt_non_granular_index: self.tgt_non_granular_index, - quantized_filename: Some(self.quantized_filename), - quantized_model_id: Some(self.quantized_model_id), + quantized_filename: self.quantized_filename, + quantized_model_id: self.quantized_model_id, }) } } @@ -208,8 +210,8 @@ impl GGUFLoader { pub fn new( model_id: Option<String>, config: GGUFSpecificConfig, - quantized_model_id: Option<String>, - quantized_filename: Option<String>, + quantized_model_id: String, + quantized_filename: String, xlora_model_id: Option<String>, kind: ModelKind, xlora_order: Option<Ordering>, @@ -218,13 +220,15 @@ impl GGUFLoader { tgt_non_granular_index: Option<usize>, ) -> Self { let model_id = if let Some(id) = model_id { - id - } else { + Some(id) + } else if let Some(xlora_order) = xlora_order.clone() { info!( "Using adapter base model ID: `{}`", - xlora_order.as_ref().unwrap().base_model_id + xlora_order.base_model_id ); - xlora_order.as_ref().unwrap().base_model_id.clone() + Some(xlora_order.base_model_id.clone()) + } else { + None }; Self { model_id, @@ -280,8 +284,8 @@ impl Loader for GGUFLoader { &token_source, revision, self, - self.quantized_model_id, - self.quantized_filename, + self.quantized_model_id.clone(), + self.quantized_filename.clone(), silent ); self.load_model_from_path(&paths?, _dtype, device, silent, mapper, in_situ_quant) @@ -356,7 +360,21 @@ impl Loader for GGUFLoader { info!("Debug is enabled, wrote the names and information about each tensor to `mistralrs_gguf_tensors.txt`."); } - let tokenizer = convert_ggml_to_hf_tokenizer(&model)?; + let ConversionResult { + tokenizer, + bos, + eos, + unk, + } = if paths.get_tokenizer_filename().to_string_lossy().is_empty() { + convert_ggml_to_hf_tokenizer(&model)? 
+ } else { + ConversionResult { + tokenizer: get_tokenizer(paths.get_tokenizer_filename())?, + bos: None, + eos: None, + unk: None, + } + }; let mut is_lora = false; let model = match self.kind { @@ -481,7 +499,7 @@ impl Loader for GGUFLoader { let gen_conf: Option<GenerationConfig> = paths .get_gen_conf_filename() .map(|f| serde_json::from_str(&fs::read_to_string(f).unwrap()).unwrap()); - let chat_template = get_chat_template(paths, &self.chat_template); + let mut chat_template = get_chat_template(paths, &self.chat_template); let max_seq_len = match model { Model::Llama(ref l) => l.max_seq_len, @@ -502,6 +520,17 @@ impl Loader for GGUFLoader { Model::Phi3(ref model) => model.cache.lock().len(), Model::XLoraPhi3(ref model) => model.cache.lock().len(), }; + + if chat_template.bos_token.is_none() && bos.is_some() { + chat_template.bos_token = Some(BeginEndUnkTok(Either::Left(bos.unwrap()))); + } + if chat_template.eos_token.is_none() && eos.is_some() { + chat_template.eos_token = Some(BeginEndUnkTok(Either::Left(eos.unwrap()))); + } + if chat_template.unk_token.is_none() && unk.is_some() { + chat_template.unk_token = Some(BeginEndUnkTok(Either::Left(unk.unwrap()))); + } + let eos = calculate_eos_tokens(&chat_template, gen_conf, &tokenizer); Ok(Arc::new(Mutex::new(GGUFPipeline { model, @@ -509,7 +538,10 @@ impl Loader for GGUFLoader { tokenizer: tokenizer.into(), no_kv_cache: self.no_kv_cache, chat_template: Arc::new(chat_template), - model_id: self.model_id.clone(), + model_id: self + .model_id + .clone() + .unwrap_or(self.quantized_model_id.clone()), non_granular_state: self.tgt_non_granular_index.map(|tgt_non_granular_index| { NonGranularState { non_granular_index: Arc::new(Mutex::new(0)), @@ -532,7 +564,7 @@ impl Loader for GGUFLoader { fn get_id(&self) -> String { self.xlora_model_id .as_deref() - .unwrap_or(&self.model_id) + .unwrap_or(self.model_id.as_ref().unwrap_or(&self.quantized_model_id)) .to_string() } diff --git a/mistralrs-core/src/pipeline/gguf_tokenizer.rs b/mistralrs-core/src/pipeline/gguf_tokenizer.rs index 1a8333616d..1d6985c1fc 100644 --- a/mistralrs-core/src/pipeline/gguf_tokenizer.rs +++ b/mistralrs-core/src/pipeline/gguf_tokenizer.rs @@ -12,7 +12,14 @@ use tracing::info; use crate::DEBUG; -pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { +pub struct ConversionResult { + pub tokenizer: Tokenizer, + pub bos: Option<String>, + pub eos: Option<String>, + pub unk: Option<String>, +} + +pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<ConversionResult> { let model = content.metadata["tokenizer.ggml.model"] .to_string() .expect("GGUF tokenizer model is not a string.") @@ -67,6 +74,10 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { .to_u32() .expect("GGUF unk token is not u32"); + let bos_str = tokens[bos as usize].clone(); + let eos_str = tokens[eos as usize].clone(); + let unk_str = tokens[unk as usize].clone(); + let (tokenizer, ty) = match model.as_str() { "llama" | "replit" => { // unigram @@ -112,7 +123,12 @@ pub fn convert_ggml_to_hf_tokenizer(content: &Content) -> Result<Tokenizer> { if DEBUG.load(Ordering::Relaxed) { info!("Tokenizer: {tokenizer:?}"); } - Ok(tokenizer) + Ok(ConversionResult { + tokenizer, + bos: Some(bos_str), + eos: Some(eos_str), + unk: Some(unk_str), + }) } mod tests { @@ -152,6 +168,7 @@ mod tests { .map_err(anyhow::Error::msg)?, ) .map_err(anyhow::Error::msg) + .map(|res| res.tokenizer) } other => anyhow::bail!("Cannot get testing HF tokenizer for type {other:?}"), } 
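For reference (this note is not part of the patch series), the `ConversionResult` returned by `convert_ggml_to_hf_tokenizer` can be exercised directly in the same way the tests above do: read a local GGUF file, convert its embedded tokenizer, and inspect the recovered `bos`/`eos`/`unk` strings. Below is a minimal sketch, written as a test-style helper that could sit next to the existing tests in `gguf_tokenizer.rs` (the module is crate-private); the GGUF file path passed to it is an arbitrary placeholder, not something the patches define.

```rust
// Illustrative sketch only, not part of the patches: a helper alongside the
// tests in `gguf_tokenizer.rs`. The GGUF file path is a placeholder.
use anyhow::Result;
use candle_core::quantized::gguf_file::Content;

use super::{convert_ggml_to_hf_tokenizer, ConversionResult};

#[allow(dead_code)]
fn inspect_gguf_tokenizer(path: &str) -> Result<()> {
    // Read the GGUF metadata, the same way the loader and the tests do.
    let mut file = std::fs::File::open(path)?;
    let content = Content::read(&mut file).map_err(anyhow::Error::msg)?;

    // Convert the embedded GGML tokenizer and recover the special tokens
    // from the GGUF metadata.
    let ConversionResult {
        tokenizer,
        bos,
        eos,
        unk,
    } = convert_ggml_to_hf_tokenizer(&content)?;
    println!("bos={bos:?} eos={eos:?} unk={unk:?}");

    // The result is an ordinary `tokenizers::Tokenizer`.
    let encoding = tokenizer
        .encode("Hello, world!", true)
        .map_err(anyhow::Error::msg)?;
    println!("{} tokens", encoding.get_ids().len());
    Ok(())
}
```

The loader change in `gguf.rs` follows the same flow, additionally feeding the recovered `bos`/`eos`/`unk` strings into the chat template when the template itself does not define them.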
diff --git a/mistralrs-core/src/pipeline/macros.rs b/mistralrs-core/src/pipeline/macros.rs index 7f8f663d59..6e29c940cd 100644 --- a/mistralrs-core/src/pipeline/macros.rs +++ b/mistralrs-core/src/pipeline/macros.rs @@ -146,12 +146,14 @@ macro_rules! get_paths_gguf { .with_token(get_token($token_source)?) .build()?; let revision = $revision.unwrap_or("main".to_string()); + let model_id_this = $this.model_id.clone().unwrap_or($this.quantized_model_id.clone()); + let model_id_copy = model_id_this.clone(); let api = api.repo(Repo::with_revision( - $this.model_id.clone(), + model_id_this.clone(), RepoType::Model, revision.clone(), )); - let model_id = std::path::Path::new(&$this.model_id); + let model_id = std::path::Path::new(&model_id_copy); let chat_template = if let Some(ref p) = $this.chat_template { if p.ends_with(".json") { @@ -171,8 +173,8 @@ macro_rules! get_paths_gguf { let filenames = get_model_paths( revision.clone(), &$token_source, - &$quantized_model_id, - &$quantized_filename, + &Some($quantized_model_id), + &Some($quantized_filename), &api, &model_id, )?; @@ -185,7 +187,7 @@ macro_rules! get_paths_gguf { xlora_config, lora_preload_adapter_info, } = get_xlora_paths( - $this.model_id.clone(), + model_id_this, &$this.xlora_model_id, &$token_source, revision.clone(), @@ -205,8 +207,14 @@ macro_rules! get_paths_gguf { None }; + let tokenizer_filename = if $this.model_id.is_some() { + $crate::api_get_file!(api, "tokenizer.json", model_id) + } else { + PathBuf::from_str("")? + }; + Ok(Box::new($path_name { - tokenizer_filename: PathBuf::from_str("")?, + tokenizer_filename, config_filename: PathBuf::from_str("")?, filenames, xlora_adapter_configs: adapter_configs, diff --git a/mistralrs-core/src/pipeline/mod.rs b/mistralrs-core/src/pipeline/mod.rs index 9d7dbee835..5dae166a37 100644 --- a/mistralrs-core/src/pipeline/mod.rs +++ b/mistralrs-core/src/pipeline/mod.rs @@ -598,9 +598,13 @@ pub trait Pipeline: Send + Sync { } else { None }; - let eos_tok = match chat_template.eos_token { - Either::Left(ref lit) => lit, - Either::Right(ref added) => &added.content, + let eos_tok = if let Some(ref unk) = self.get_chat_template().eos_token { + match unk.0 { + Either::Left(ref lit) => Some(lit.to_string()), + Either::Right(ref added) => Some(added.content.to_string()), + } + } else { + None }; let unk_tok = if let Some(ref unk) = self.get_chat_template().unk_token { match unk.0 { @@ -1436,7 +1440,7 @@ mod tests { true, template, Some(bos.to_string()), - eos, + Some(eos.to_string()), Some(unk.to_string()), ) .unwrap_or_else(|_| panic!("Template number {i}")); diff --git a/mistralrs/Cargo.toml b/mistralrs/Cargo.toml index d105982bd9..49027c7e05 100644 --- a/mistralrs/Cargo.toml +++ b/mistralrs/Cargo.toml @@ -52,4 +52,8 @@ required-features = [] [[example]] name = "lora_activation" +required-features = [] + +[[example]] +name = "gguf_locally" required-features = [] \ No newline at end of file diff --git a/mistralrs/examples/gguf_locally/main.rs b/mistralrs/examples/gguf_locally/main.rs new file mode 100644 index 0000000000..b04fc9fa53 --- /dev/null +++ b/mistralrs/examples/gguf_locally/main.rs @@ -0,0 +1,64 @@ +use std::sync::Arc; +use tokio::sync::mpsc::channel; + +use mistralrs::{ + Constraint, Device, DeviceMapMetadata, GGUFLoaderBuilder, GGUFSpecificConfig, MistralRs, + MistralRsBuilder, NormalRequest, Request, RequestMessage, Response, SamplingParams, + SchedulerMethod, TokenSource, +}; + +fn setup() -> anyhow::Result<Arc<MistralRs>> { + // Select a Mistral model + // We do not use any files 
from HF servers here, and instead load the + // chat template from the specified file, and the tokenizer and model from a + // local GGUF file at the path `.` + let loader = GGUFLoaderBuilder::new( + GGUFSpecificConfig { repeat_last_n: 64 }, + Some("chat_templates/mistral.json".to_string()), + None, + ".".to_string(), + "mistral-7b-instruct-v0.1.Q4_K_M.gguf".to_string(), + ) + .build(); + // Load, into a Pipeline + let pipeline = loader.load_model_from_hf( + None, + TokenSource::CacheToken, + None, + &Device::cuda_if_available(0)?, + false, + DeviceMapMetadata::dummy(), + None, + )?; + // Create the MistralRs, which is a runner + Ok(MistralRsBuilder::new(pipeline, SchedulerMethod::Fixed(5.try_into().unwrap())).build()) +} + +fn main() -> anyhow::Result<()> { + let mistralrs = setup()?; + + let (tx, mut rx) = channel(10_000); + let request = Request::Normal(NormalRequest { + messages: RequestMessage::Completion { + text: "Hello! My name is ".to_string(), + echo_prompt: false, + best_of: 1, + }, + sampling_params: SamplingParams::default(), + response: tx, + return_logprobs: false, + is_streaming: false, + id: 0, + constraint: Constraint::None, + suffix: None, + adapters: None, + }); + mistralrs.get_sender().blocking_send(request)?; + + let response = rx.blocking_recv().unwrap(); + match response { + Response::CompletionDone(c) => println!("Text: {}", c.choices[0].text), + _ => unreachable!(), + } + Ok(()) +} diff --git a/mistralrs/examples/quantized/main.rs b/mistralrs/examples/quantized/main.rs index 58f1ac92b4..b6539edaf2 100644 --- a/mistralrs/examples/quantized/main.rs +++ b/mistralrs/examples/quantized/main.rs @@ -9,6 +9,7 @@ use mistralrs::{ fn setup() -> anyhow::Result<Arc<MistralRs>> { // Select a Mistral model + // This uses a model, tokenizer, and chat template, from HF hub. let loader = GGUFLoaderBuilder::new( GGUFSpecificConfig { repeat_last_n: 64 }, None, From 813d83211e1391060261adde8de15b48cbe814cf Mon Sep 17 00:00:00 2001 From: EricLBuehler <ericlbuehler@gmail.com> Date: Wed, 29 May 2024 10:04:21 -0400 Subject: [PATCH 23/23] Allow unauth for local --- mistralrs-core/src/pipeline/macros.rs | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mistralrs-core/src/pipeline/macros.rs b/mistralrs-core/src/pipeline/macros.rs index 6e29c940cd..b9bf402f2f 100644 --- a/mistralrs-core/src/pipeline/macros.rs +++ b/mistralrs-core/src/pipeline/macros.rs @@ -11,14 +11,21 @@ macro_rules! api_dir_list { .unwrap_or_else(|e| { // If we do not get a 404, it was something else. let format = format!("{e:?}"); + let mut unauth = false; if let hf_hub::api::sync::ApiError::RequestError(resp) = e { - if resp.into_response().is_some_and(|r| r.status() != 404) { + let resp = resp.into_response(); + // If it's 401, assume that we're running locally only. + if resp.as_ref().is_some_and(|r| r.status() != 401) { + unauth = true; + } else if resp.as_ref().is_some_and(|r| r.status() != 404) { panic!("{format}"); } } let listing = std::fs::read_dir($model_id); - if listing.is_err() { + if listing.is_err() && unauth { + panic!("{format}"); + } else if listing.is_err() { panic!("Cannot list directory {:?}", $model_id) } let listing = listing.unwrap(); @@ -43,14 +50,21 @@ macro_rules! api_get_file { $api.get($file).unwrap_or_else(|e| { // If we do not get a 404, it was something else. 
let format = format!("{e:?}"); + let mut unauth = false; if let hf_hub::api::sync::ApiError::RequestError(resp) = e { - if resp.into_response().is_some_and(|r| r.status() != 404) { + let resp = resp.into_response(); + // If it's 401, assume that we're running locally only. + if resp.as_ref().is_some_and(|r| r.status() != 401) { + unauth = true; + } else if resp.as_ref().is_some_and(|r| r.status() != 404) { panic!("{format}"); } } let path = $model_id.join($file); - if !path.exists() { + if !path.exists() && unauth { + panic!("{format}"); + } else if !path.exists() { panic!("File \"{}\" not found at model id {:?}", $file, $model_id) } info!("Loading `{:?}` locally at `{path:?}`", &$file);